import numpy as np
import pandas as pd
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.decomposition import PCA
from scipy import stats
from sklearn.mixture import GaussianMixture
from sklearn.model_selection import train_test_split
from sklearn import feature_selection
from sklearn import model_selection
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from scipy.sparse import csr_matrix
from numpy import count_nonzero
from scipy.sparse import lil_matrix
from scipy import sparse
from scipy.stats import uniform
import pandas as pd
import sys
import numpy as np
from numpy import mean
from numpy import std
from numpy.random import randn
from numpy.random import seed
from matplotlib import pyplot
from sklearn.mixture import GaussianMixture
from scipy import stats
import warnings
warnings.filterwarnings('ignore')
To use if running with Colab; otherwise, ignore.\ Please put the files in the same folder as the notebook.
# Mount Google Drive when running on Colab (interactive; prompts for auth).
from google.colab import drive
drive.mount('/content/drive')
%cd '/content/drive/My Drive/'
# Metadata: one row per sequenced cell; columns describe the experiment setup.
MCF7_meta = pd.read_csv('MCF7_SmartS_MetaData.tsv',delimiter = '\t', index_col = 0)
HCC1806_meta = pd.read_csv('HCC1806_SmartS_MetaData.tsv',delimiter = '\t', index_col = 0)
# Unfiltered count matrices: rows = genes, columns = cells.
# NOTE(review): delimiter='\ ' is an escaped space parsed with the python
# engine — assumes the files are space-separated; confirm against the data.
MCF7_unf = pd.read_csv('MCF7_SmartS_Unfiltered_Data.txt', delimiter='\ ',engine='python', index_col=0)
HCC1806_unf = pd.read_csv("HCC1806_SmartS_Unfiltered_Data.txt",delimiter="\ ",engine='python',index_col=0)
# Filtered + normalised training sets (3000 genes), Smart-Seq and Drop-Seq.
MCF7_f_n = pd.read_csv('MCF7_SmartS_Filtered_Normalised_3000_Data_train.txt', delimiter = '\ ', index_col=0,engine='python')
HCC1806_f_n= pd.read_csv('HCC1806_SmartS_Filtered_Normalised_3000_Data_train.txt', delimiter = '\ ', index_col=0,engine='python')
MCF7_drop_f_n = pd.read_csv('MCF7_Filtered_Normalised_3000_Data_train.txt', delimiter = '\ ', index_col=0,engine='python')
HCC1806_drop_f_n = pd.read_csv('HCC1806_Filtered_Normalised_3000_Data_train.txt', delimiter = '\ ', index_col=0,engine='python')
The overall aim of the AI Lab is to build a classification device able to predict, by exploiting which genes are expressed in a particular cell and in what quantity, whether the cell is under normoxia condition or hypoxia condition.\ A hypoxic cell is a cell that doesn't receive much $O_2$. There are different stages of hypoxia, and the data used in this project have been obtained by sequencing cancer cells that had previously been subjected to two different possible conditions: normal oxygen level (~21% of $O_2$) and hypoxic oxygen level (~1% of $O_2$).\ Being able to predict if a cell is in hypoxic condition could be of fundamental importance as cells under this condition have greater resistance to cancer treatments.
The datasets come from two different cell lines, HCC1806 and MCF7 respectively, both made up of cells from women with breast cancer.\ These data were obtained by sequencing (using Smart-Seq and Drop-Seq) the samples. After the sequencing process, we have some data showing whether a specific gene is present and in what quantity is expressed in that specific cell.
First, we will do an exploratory data analysis using both the complete (unfiltered) datasets and those already filtered and normalized.\ In this phase we tried to understand the size of the dataset and if there were particular behaviors to be studied in details later. Additionally, one of the main objectives of this first phase was to try to understand, since the dataset was very sparse, how the size of the dataset could be reduced without losing information.\ In the central part of our work, we exploited PCA and clustering to gain a deeper understanding and begin to identify which patterns could be crucial in identifying the condition of hypoxia.\ Finally, using the Support Vector Machine, logistic regression, random forests classifiers and neural networks, we developed a tool capable of classifying the condition under which a cell had been subjected.
Firstly we download the meta_data files that provide important information on how the experiments were conducted on the single cell.\ It is organized as follows: each row represents the sequencing of RNA done on a single cell using pcr technique. Each column specifies how the experiment was performed: 'cell line', 'plate', 'position of the cell', 'condition in which the cell was exposed and other information related to experiment'. The file's rows correspond to the column's name of the file we want to analyze.
# Print the dimensions of the file
print("Dataframe dimensions:", np.shape(MCF7_meta))
# Display the first 5 rows
MCF7_meta.head()
The MCF7 metadata dataframe has 383 rows.
This means that, in the MCF7 unfiltered dataframe, we'll have 383 cells (columns).
# Display the bottom 5 rows
MCF7_meta.tail(5)
# To print the dimensions of the file
print("Dataframe dimensions:", np.shape(HCC1806_meta))
# To display the first 5 rows
HCC1806_meta.head()
The HCC1806 metadata dataframe has 243 rows.
This means that, in the HCC1806 unfiltered dataframe, we'll have 243 cells (columns).
# To display the bottom 5 rows
HCC1806_meta.tail(5)
# Checking the data type of each metadata column (mix of object and integer)
MCF7_meta.dtypes
# Checking the data type
HCC1806_meta.dtypes
We can distinguish between object type and integer type, the two cell lines have most of the characteristics in common except for 'Lane' in MCF7 and 'PCR Plate' in HCC1806
# List the metadata column names for both cell lines
list(MCF7_meta.columns)
list(HCC1806_meta.columns)
# Finding the null values (count of missing entries per column).
print(MCF7_meta.isnull().sum())
The output tells us that there are no missing (null) values in any column.
The meta_data files are organized as follows: the rows are the sequencing of rna done on a single cell using pcr technique (x383 for MCF7, x243 for HCC1806). The columns specify how the experiment was performed, 'cell line', 'plate', position of the cell, condition[...]
We start by analyzing the MCF7 unfiltered dataset.
# To display the first 5 rows of the unfiltered MCF7 data
MCF7_unf.head()
# To print the dimensions of the dataset (genes x cells)
print("Dataframe dimensions:", np.shape(MCF7_unf))
# To display the first column of the dataset (the expression profile of one cell)
print("First column: ", MCF7_unf.iloc[ : , 0])
The dimension of the MCF7 SmartSeq unfiltered data set is of 22934 rows and 383 columns.
Each row corresponds to a specific gene while each column corresponds to a cell. Hence, in each entry we can retrieve how much of that specific gene has been expressed in that specific cell. Looking at the names of the columns we can also find directly whether that cell was under normoxia or hypoxia without looking at the metadata.
# To print the names of the cells (columns)
list(MCF7_unf.columns)
# To print the names of the genes (the row index)
gene_symbls = MCF7_unf.index
print("Dataframe indexes: ", gene_symbls)
# To check the data types of the entries
MCF7_unf.dtypes
As shown, the genes are represented as an integer value that represents how much they are expressed in each cell
# To count the null (NaN) values per column
print(MCF7_unf.isnull().sum())
# To remove rows containing missing (NaN) values
# NOTE(review): dropna removes NaN rows, not all-zero rows as the text implies.
MCF7_unf2 = MCF7_unf.dropna()
print(MCF7_unf2.isnull().sum())
# To check the dimension of the new dataset without null values
print("Dimension: ", MCF7_unf2.shape)
print("Did the dataset change?\n", not(MCF7_unf2.equals(MCF7_unf)))
After dropping the rows and the column that contains only NAs and zeroes, we see that the dataframe maintains the same dimensions. No rows or columns were dropped, the two datasets are equal.
Checking, and dropping, duplicate rows can also be useful.
# To check whether there are duplicate rows (drop_duplicates keeps the first)
MCF7_unf_noDup = MCF7_unf.drop_duplicates()
print("The shape without duplicates is: ", MCF7_unf_noDup.shape)
print("Did the dataset change?\n", not(MCF7_unf_noDup.equals(MCF7_unf)))
Dropping the duplicate rows has changed the dimensions of the dataframe, some rows were discarded.
Let's now look at some informations about the dataframe.
# To check the characteristics of each cell (count, mean, std, quartiles, ...)
MCF7_unf_describe = MCF7_unf.describe()
MCF7_unf_describe
At first glance it emerges that while there are many cells with a mean greater than 40 there are many others with a mean very close to zero, hence, it seems that in these cells very few genes have been expressed. These cells could exhibit this behavior because perhaps they will probably die shortly but this is a domain specific question which we will not answer in our analysis. In any case, cells with so few expressed genes are not useful for our classification device and therefore we could delete their respective columns. Another important feature that emerges is that some cells have the max value much higher than the mean, for example in the first column the mean is equal to 40.817651 while max is equal to 46744; this makes us think that in some cells there are genes that are extremely characteristic compared to others and these genes will be the ones of greatest interest for us but it is still early to say it ...
Plotting the distribution of the mean in the cells though an histogram confirms the fact that there are two different peaks: one between 0 and 10 and the other around 50 and so the majority of the cells have a very low mean or instead one with a value near 50.
# To plot an histogram with the means (row 1 of describe() is the mean)
sns.displot(data = MCF7_unf_describe.iloc[1], bins=50, color="teal")
plt.show()
As can be seen from the plot, the count of the means follows a bimodal distribution (with two peaks), one around 0 and the other around 50.\ Hence, already at this point, we can use a natural threshold to filter out the cells where very few genes were expressed.
print(MCF7_unf.shape)
data_no_zero = MCF7_unf
# Drop cells (columns) whose mean expression is <= 30; describe().iloc[1] is
# the per-column mean row.
data_no_zero = data_no_zero.drop(columns=data_no_zero.columns[data_no_zero.describe().iloc[1]<=30])
# Alternative row-wise filter, kept for reference:
# data_no_zero = data_no_zero.drop(data_no_zero.loc[data_no_zero.mean(axis=1)<=50].index)
# data_no_zero.describe()
We plot below the count of the means where we filtered out the cells with mean value below 30.
# To plot an histogram with the means after filtering
sns.displot(data = data_no_zero.describe().iloc[1], bins=50, color="teal")
plt.show()
Another confirmation of this behaviour is given by the fact that in almost all cells the 50-percentile of the distribution is zero. Plotting instead the 75-percentile we can also see that there is a peak of cells that exhibit a 75-percentile near zero and another peak that has instead a 75-percentile near 30. Already from these basic plots we can start to formulate the hypothesis that the data at our disposal is sparse and highly non-symmetrical, so we proceed further in this direction.
# To plot a histogram with the 75-percentile (row 6 of describe())
sns.displot(data = MCF7_unf_describe.iloc[6], bins=50, color="teal")
plt.show()
Plotting the violin plots for the first 50 samples it emerges again that there are some cells with a very short line that means that the distribution is highly concentrated near zero while the cells with a very long line are those whose tails of the distribution are 'heavy' and so there is a high probability of having data points very far from the median in this samples.
# To plot a violin plot of the first 50 cells
plt.figure(figsize=(16,4))
plot=sns.violinplot(data=MCF7_unf.iloc[:, :50],palette="Set3",cut=0)
plt.setp(plot.get_xticklabels(), rotation=90)
plt.show()
To get a better visual about the distibution of genes in some specific cells, it is usefull to plot some boxplots.
cnames = list(MCF7_unf.columns)
# Echo the column name, then draw its boxplot
cnames[1]
sns.boxplot(x=MCF7_unf[cnames[1]])
plt.show()
cnames = list(MCF7_unf.columns)
# Echo the name of the column that is actually plotted below.
# Fix: the original echoed cnames[1] while plotting cnames[5], so the
# displayed name did not match the boxplot.
cnames[5]
sns.boxplot(x=MCF7_unf[cnames[5]])
plt.show()
We can see both from the boxplots and the violin plots that the range of the values are much different from a cell to another. But we can see that most genes have 0 value.
Here we repeat the same analysis using the transpose of the data, in order to see which genes are highly expressed with respect to others
# To transpose the data: now rows = cells, columns = genes
MCF7_unf_transposed = MCF7_unf.transpose()
MCF7_unf_transposed
# To check the information about the transposed dataset (per-gene statistics)
MCF7_unf_transposed_describe = MCF7_unf_transposed.describe()
MCF7_unf_transposed_describe
# to plot a histogram with the mean of the expression of the genes
sns.displot(data = MCF7_unf_transposed_describe.iloc[1], bins=500, color="teal")
# we limit the x axis since most of the values are around 0
plt.xlim(0, 600)
plt.show()
From this plot it is simple to see that most of the genes are not present in most of the cells. The same can be seen for the 75% percetile.
# to plot a histogram with the 75% percentile of the expression of the genes
sns.displot(data = MCF7_unf_transposed_describe.iloc[6], bins=500, color="teal")
# we limit the x axis since most of the values are around 0
plt.xlim(0, 1000)
plt.show()
From this plots we can deduce that specific genes are not expressed in most of the cells.
# To plot a violin plot of the first 50 genes
# Fix: the original passed MCF7_unf_transposed_describe (the summary-statistics
# table), so the violins showed distributions of count/mean/std/... rather
# than the gene expression values the text describes. Use the transposed data,
# mirroring the cell-wise violin plot above.
df_small = MCF7_unf_transposed.iloc[:, :50]
plt.figure(figsize=(16,4))
plot=sns.violinplot(data=df_small,palette="Set3",cut=0)
plt.setp(plot.get_xticklabels(), rotation=90)
plt.show()
Also the violin plots show that most of the probability of a gene to be near the 0 values is much higher than any other value.
In order to understand if there are some genes that are highly correlated we plot che a table with the correlation of the first 25 genes.
# To see the correlation between genes (first 25 genes only)
df_new = MCF7_unf.transpose().iloc[:,:25]
plt.figure(figsize=(25,25))
sns.heatmap(df_new.corr(),cbar=True,annot=True,cmap='Blues')
plt.show()
From the table we deduce that the genes interact between them in a quite complex way: some are positively correlated while others are negatively correlated. Interestingly there are some genes that have a very high correlation; this could mean that they are the same gene or they are genes with a very high level of similarity.
We can now look for outliers. And let's try to eliminate them and look again at the violin plots.
# To find interquartile range (computed per cell/column)
Q1 = MCF7_unf.quantile(0.25)
Q3 = MCF7_unf.quantile(0.75)
IQR = Q3 - Q1
print(IQR)
# To find values in the interquartile range: keep only genes (rows) that lie
# inside the 1.5*IQR fences in every cell (any outlier drops the whole row).
df_noOut = MCF7_unf[~((MCF7_unf < (Q1 - 1.5 * IQR)) |(MCF7_unf > (Q3 + 1.5 * IQR))).any(axis=1)]
df_noOut.shape
df_noOut.head(3)
# Boxplot of one cell after outlier removal
sns.boxplot(x=df_noOut[cnames[1]])
plt.show()
# Violin plots of the first 50 cells after outlier removal
df_noOut_small = df_noOut.iloc[:, :50]
plt.figure(figsize=(16,4))
plot=sns.violinplot(data=df_noOut_small,palette="Set3",cut=0)
plt.setp(plot.get_xticklabels(), rotation=90)
plt.show()
As seen from the data, most of the genes are expressed in the last quartile. Hence, in this case, the outliers carry most of the information and by removing them we would lose a great part of it. That is due to the fact that the data is sparse, and that makes it harder to just exclude outliers from the dataset.
We have tried to quantify the sparsity using an index given by the difference between 1 and the ratio of nonzero values and the size of the data set. This tells us that around 60% of the data is 0, hence highly sparse.
# create a 2-D representation of the matrix
MCF7_arr = np.array(MCF7_unf)
print("Dense matrix representation: \n", MCF7_arr)
# calculate sparsity: fraction of entries equal to zero
sparsity = 1.0 - count_nonzero(MCF7_arr) / MCF7_arr.size
print(sparsity)
From here we can deduce that around 60% of the data is equal to zero.
Therefore, we tried to represent the data set as a sparse matrix using the csr_matrix() function, and analysed the differences in memory storage between the two representations. In practice, the original numpy array takes almost the double of the storage compared to the sparse matrix, that is why this representation can be an advantage.
# convert to sparse matrix representation (CSR stores only the nonzeros)
S = csr_matrix(MCF7_arr)
print("Sparse matrix: \n",S)
# To calculate the memory usage of the original dataset
print('MCF7_arr.nbytes: {0} bytes'.format(MCF7_arr.nbytes))
print('sys.getsizeof(MCF7_arr): {0} bytes'.format(sys.getsizeof(MCF7_arr)))
# To calculate the memory usage of the sparse matrix (data + index arrays)
S.data.nbytes + S.indptr.nbytes + S.indices.nbytes
# convert back to 2-D representation of the matrix
B = S.todense()
print("Dense matrix: \n", B)
B.nbytes
We're happy to see that the sparse matrix occupy less space than the original data (without loosing any information)
We then analysed the skewness and Kurtosis of a single cell of the data to check whether it is normalised, but because most of these values are not near 0, we can deduce that most of the cells have a distribution that differ much from the normal one.\
#np.sum(MCF7_unf_sum.iloc[4] != 0)
MCF7_noOut = MCF7_unf
cnames = MCF7_unf.columns
# Compute the skewness of each cell's expression profile.
from scipy.stats import kurtosis, skew
from sklearn.preprocessing import PowerTransformer, QuantileTransformer
colN = np.shape(MCF7_unf)[1]
colN
# One skewness value per column, using scipy's (biased) estimator.
df_skew_cells = [skew(MCF7_unf[col]) for col in cnames[:colN]]
df_skew_cells
plt.figure(figsize=(10,7))
sns.histplot(df_skew_cells, bins=100, color="teal")
plt.xlabel('Skewness of single cells expression profiles - original df')
plt.show()
# Compute the kurtosis of each cell's expression profile.
df_kurt_cells = [kurtosis(MCF7_unf[col]) for col in cnames[:colN]]
df_kurt_cells
plt.figure(figsize=(10,7))
sns.histplot(df_kurt_cells, bins=100, color="teal")
plt.xlabel('Kurtosis of single cells expression profiles - original df')
plt.show()
# To check normality of the genes via kernel density estimates
df_small = MCF7_unf.transpose().iloc[:, 10:30] #just selecting part of the samples so run time not too long
plt.figure(figsize=(10,7))
sns.displot(data=df_small,palette="Set3",kind="kde", bw_adjust=2)
plt.show()
To show that the normality assumption is not satisfied we use the qqplot() function.
# QQ Plot against a theoretical normal distribution
from statsmodels.graphics.gofplots import qqplot
# Fix: the original cell also seeded numpy's RNG and generated an unused
# synthetic sample (data = 5 * randn(100) + 50); that dead code is removed.
# q-q plot of the unfiltered data
qqplot(MCF7_unf, line='s')
plt.show()
We compare the QQplot in the filtered and normalized case:
# QQ Plot of the filtered and normalized data
from statsmodels.graphics.gofplots import qqplot
# Fix: the original cell also seeded numpy's RNG and generated an unused
# synthetic sample (data = 5 * randn(100) + 50); that dead code is removed.
# q-q plot of the filtered/normalised data
qqplot(MCF7_f_n, line='s')
plt.show()
In reality, it's very unlikely to find data that are normally distributed. Also in this case, as we can see by plotting the QQ-plot, the data are not normally distributed, in fact, the data points in the plot are not close to the red line.
Another approach we took was to try to filter the data by dropping all the rows (genes) that have third quantile smaller than 95 and by dropping the columns (cells) that have third quantile smaller than 14. These two numbers allows us to get a dataset that has dimensions similar to the ones of the filtered and normalized dataset. Unfortunately, they do not coincide.
# Filter the data to approximate the provided filtered/normalised dataset.
data = MCF7_unf
# Per-gene statistics (describe of the transpose) and per-cell statistics.
data_d = data.T.describe()
data_d_n = data.describe()
# Drop genes (rows) whose third quartile is <= 95.
data1 = data.drop(data.loc[data_d.iloc[6]<=95].index)
print(data1.shape)
# Drop cells (columns) whose third quartile is <= 14.
data2 = data1.drop(columns=data1.columns[data_d_n.iloc[6]<=14])
print(data2.shape)
# Count how many of the officially filtered genes survive our filter.
print(len([i for i in list(MCF7_f_n.index) if i in list(data2.index)]))
#print(data2)
We can now try to compute the log_2 of the whole dataset, after adding 1 to it (so we don't have problems with the zeros), and see if it can be a good normalization by looking again at the plots of the skewness and the kurtosis.
# log2(x + 1) transform: the +1 avoids taking log of zero counts.
MCF7_unf_plus = MCF7_unf + 1
MCF7_unf_log2 = MCF7_unf_plus.apply(np.log2)
MCF7_unf_log2.describe()
Histogram of the mean
# To plot an histogram with the means
sns.displot(data = MCF7_unf_log2.describe().iloc[1], bins=50, color="teal")
plt.show()
Histogram of the third quantile
# To plot an histogram with the third quantile
sns.displot(data = MCF7_unf_log2.describe().iloc[6], bins=50, color="teal")
plt.show()
We can see that we still have many zeroes, but way less than before. We have also a much smaller range of values.
Let's now plot again the skewness and the kurtosis of the cells, but for the dataset where we performed the logarithm.
cnames = MCF7_unf_log2.columns
# Skewness of each cell in the log2-transformed data.
from scipy.stats import kurtosis, skew
colN = np.shape(MCF7_unf_log2)[1]
colN
# One skewness value per column, using scipy's (biased) estimator.
df_skew_cells = [skew(MCF7_unf_log2[col]) for col in cnames[:colN]]
df_skew_cells
plt.figure(figsize=(10,7))
sns.histplot(df_skew_cells, bins=100, color="teal")
plt.xlabel('Skewness of single cells expression profiles - log_2 df')
plt.show()
# Kurtosis of each cell in the log2-transformed data.
df_kurt_cells = [kurtosis(MCF7_unf_log2[col]) for col in cnames[:colN]]
df_kurt_cells
plt.figure(figsize=(10,7))
sns.histplot(df_kurt_cells, bins=100, color="teal")
plt.xlabel('Kurtosis of single cells expression profiles - log_2 df')
plt.show()
We can definitely see that the data is much more normalized because most of the skewness and kurtosis values (not of all cells but most) are close to 0.
We continue by analyzing the HCC1806 dataset.
# Dimensions of the HCC1806 unfiltered dataset (genes x cells)
print("Dataframe dimensions:", np.shape(HCC1806_unf))
# Expression profile of the first cell
print("First column: ", HCC1806_unf.iloc[ : , 0])
Let's now look at some informations about the dataframe.
# To check the characteristics of each cell (count, mean, std, quartiles, ...)
HCC1806_unf_describe = HCC1806_unf.describe()
HCC1806_unf_describe
As in the MCF7 dataset, a feature that emerges is that some cells have the max value much higher than the mean, for example in the first column the mean is equal to 99.565695 while max is equal to 35477; this makes us think that in some cells there are genes that are extremely characteristic compared to others and these genes will be the ones of greatest interest for us but it is still early to say it ...
Plotting the count of the means in the cells through an histogram confirms the fact that there are several peaks, the greatest one being at around 45. The data is dishomogeneus.
# To plot an histogram with the means (row 1 of describe() is the mean)
sns.displot(HCC1806_unf_describe.iloc[1], bins=50, color="teal")
plt.show()
We can see that, differently than the MCF7 dataframe, less cells have mean of genes equal to 0.\ By plotting the third quantile, it can be seen that there is a peak at around 25.\ Much more interesting is the fact that, in general, it seems that the average 75% quantile is smaller than the mean. This is probably due to the fact that we have many cells with 0-value.
# To plot a histogram with the 75-percentile (row 6 of describe())
sns.displot(data = HCC1806_unf_describe.iloc[6], bins=50, color="teal")
plt.show()
Plotting the violin plots for the first 50 samples it emerges again that there are some cells with a very short line that means that the distribution is highly concentrated near zero while the cells with a very long line are those whose tails of the distribution are 'heavy' and so there is a high probability of having data points very far from the median in this samples.
# To plot a violin plot of the first 50 cells
plt.figure(figsize=(16,4))
plot=sns.violinplot(data=HCC1806_unf.iloc[:, :50],palette="Set3",cut=0)
plt.setp(plot.get_xticklabels(), rotation=90)
plt.show()
To get a better visual about the distibution of genes in some specific cells, it is usefull to plot some boxplots.
cnames = list(HCC1806_unf.columns)
# Echo the column name, then draw its boxplot
cnames[1]
sns.boxplot(x=HCC1806_unf[cnames[1]])
plt.show()
cnames = list(HCC1806_unf.columns)
# Echo the name of the column that is actually plotted below.
# Fix: the original echoed cnames[1] while plotting cnames[5], so the
# displayed name did not match the boxplot.
cnames[5]
sns.boxplot(x=HCC1806_unf[cnames[5]])
plt.show()
We can see both from the boxplots and the violin plots that the range of the values are much different from a cell to another. But we can see that most genes have 0 value.
Here we repeat the same analysis using the transpose of the data, in order to see which genes are highly expressed with respect to others
# To transpose the data: now rows = cells, columns = genes
HCC1806_unf_transposed = HCC1806_unf.transpose()
HCC1806_unf_transposed
# To check the information about the transposed dataset (per-gene statistics)
HCC1806_unf_transposed_describe = HCC1806_unf_transposed.describe()
HCC1806_unf_transposed_describe
# to plot a histogram with the mean of the expression of the genes
sns.displot(data = HCC1806_unf_transposed_describe.iloc[1], bins=500)
# we limit the x axis because most of the values are near 0
plt.xlim(0, 1000)
plt.show()
From this plot it is simple to see that most of the genes are not present in most of the cells. The same can be seen for the 75% percetile.
# to plot a histogram with the 75% percentile of the expression of the genes
sns.displot(data = HCC1806_unf_transposed_describe.iloc[6], bins=500)
# we limit the x axis because most of the values are near 0
plt.xlim(0, 1000)
plt.show()
From these plots we can deduce that specific genes are not expressed in most of the cells.
# To plot a violin plot of the first 50 genes
# Fix: the original passed HCC1806_unf_transposed_describe (the summary
# statistics table), so the violins showed distributions of count/mean/std/...
# rather than the gene expression values the text describes. Use the
# transposed data, mirroring the cell-wise violin plot above.
df_small = HCC1806_unf_transposed.iloc[:, :50]
plt.figure(figsize=(16,4))
plot=sns.violinplot(data=df_small,palette="Set3",cut=0)
plt.setp(plot.get_xticklabels(), rotation=90)
plt.show()
Also the violin plots show us that the probability of a gene to be near the 0 values in most cells is much higher than any other probability.
# To see the correlation between genes (first 25 genes only)
df_new = HCC1806_unf.transpose().iloc[:,:25]
plt.figure(figsize=(25,25))
sns.heatmap(df_new.corr(),cbar=True,annot=True,cmap='Blues')
plt.show()
Let's now look for outliers, and let's try to eliminate them and look at the violin plots.\ We first find the interquartile range of the data by eliminating the first and last quartile.
# To find interquartile range (computed per cell/column)
Q1 = HCC1806_unf.quantile(0.25)
Q3 = HCC1806_unf.quantile(0.75)
IQR = Q3 - Q1
print(IQR)
# To find values in the interquartile range: keep only genes (rows) that lie
# inside the 1.5*IQR fences in every cell (any outlier drops the whole row).
df_noOut = HCC1806_unf[~((HCC1806_unf < (Q1 - 1.5 * IQR)) |(HCC1806_unf > (Q3 + 1.5 * IQR))).any(axis=1)]
df_noOut.shape
df_noOut.head(3)
cnames = df_noOut.columns
# Boxplot of one cell after outlier removal
sns.boxplot(x=df_noOut[cnames[1]])
plt.show()
# Violin plots of the first 50 cells after outlier removal
df_noOut_small = df_noOut.iloc[:, :50]
plt.figure(figsize=(16,4))
plot=sns.violinplot(data=df_noOut_small,palette="Set3",cut=0)
plt.setp(plot.get_xticklabels(), rotation=90)
plt.show()
As seen from the data, most of the genes are expressed in the last quartile. Hence, in this case, the outliers carry most of the information and by removing them we would lose a great part of it. That is due to the fact that the data is sparse, and that makes it harder to just exclude outliers from the dataset.
We have tried to quantify the sparsity using an index given by the difference between 1 and the ratio of nonzero values and the size of the data set. This tells us that around 60% of the data is 0, hence highly sparse.
# create a 2-D representation of the matrix
HCC1806_arr = np.array(HCC1806_unf)
print("Dense matrix representation: \n", HCC1806_arr)
# calculate sparsity: fraction of entries equal to zero
sparsity = 1.0 - count_nonzero(HCC1806_arr) / HCC1806_arr.size
print(sparsity)
From here we can deduce that around 56% of the data is equal to zero.
We then analysed the skewness and Kurtosis of a single cell of the data to check whether it is normalised, but because most of these values are not near 0, we can deduce that most of the cells have a distribution that differ much from the normal one.\
#np.sum(MCF7_unf_sum.iloc[4] != 0)
HCC1806_noOut = HCC1806_unf
cnames = HCC1806_unf.columns
# Compute the skewness of each cell's expression profile.
from scipy.stats import kurtosis, skew
from sklearn.preprocessing import PowerTransformer, QuantileTransformer
colN = np.shape(HCC1806_unf)[1]
colN
# One skewness value per column, using scipy's (biased) estimator.
df_skew_cells = [skew(HCC1806_unf[col]) for col in cnames[:colN]]
df_skew_cells
plt.figure(figsize=(10,7))
sns.histplot(df_skew_cells, bins=100, color="teal")
plt.xlabel('Skewness of single cells expression profiles - original df')
plt.show()
# Compute the kurtosis of each cell's expression profile.
df_kurt_cells = [kurtosis(HCC1806_unf[col]) for col in cnames[:colN]]
df_kurt_cells
plt.figure(figsize=(10,7))
sns.histplot(df_kurt_cells, bins=100, color="teal")
plt.xlabel('Kurtosis of single cells expression profiles - original df')
plt.show()
# To check normality of the genes via kernel density estimates
df_small = HCC1806_unf.transpose().iloc[:, 10:30] #just selecting part of the samples so run time not too long
plt.figure(figsize=(10,7))
sns.displot(data=df_small,palette="Set3",kind="kde", bw_adjust=2)
plt.show()
To show that the normality assumption is not satisfied we use the qqplot() function.
# QQ Plot against a theoretical normal distribution
from statsmodels.graphics.gofplots import qqplot
# Fix: the original cell also seeded numpy's RNG and generated an unused
# synthetic sample (data = 5 * randn(100) + 50); that dead code is removed.
# q-q plot of the unfiltered data
qqplot(HCC1806_unf, line='s')
plt.show()
We compare the QQplot in the filtered and normalized case:
# QQ Plot of the filtered and normalized data
from statsmodels.graphics.gofplots import qqplot
# Fix: the original cell also seeded numpy's RNG and generated an unused
# synthetic sample (data = 5 * randn(100) + 50); that dead code is removed.
# q-q plot of the filtered/normalised data
qqplot(HCC1806_f_n, line='s')
plt.show()
In reality, it's very unlikely to find data that are normally distributed. Also in this case, as we can see by plotting the QQ-plot, the data are not normally distributed, in fact, the data points in the plot are not close to the red line.
We can now try to compute the log_2 of the whole dataset, after adding 1 to it (so we don't have problems with the zeros), and see if it can be a good normalization by looking again at the plots of the skewness and the kurtosis.
# log2(x + 1) transform: the +1 avoids taking log of zero counts.
HCC1806_unf_plus = HCC1806_unf + 1
HCC1806_unf_log2 = HCC1806_unf_plus.apply(np.log2)
HCC1806_unf_log2.describe()
Histogram of the mean
# To plot an histogram with the means
sns.displot(data = HCC1806_unf_log2.describe().iloc[1], bins=50, color="teal")
plt.show()
Histogram of the third quantile
# To plot an histogram with the third quantile
sns.displot(data = HCC1806_unf_log2.describe().iloc[6], bins=50, color="teal")
plt.show()
We can see that we still have some zeroes, but way less than before. We have also a much smaller range of values.
Let's now plot again the skewness and the kurtosis of the cells, but for the dataset where we performed the logarithm.
cnames = HCC1806_unf_log2.columns
# Skewness of each cell in the log2-transformed data.
from scipy.stats import kurtosis, skew
colN = np.shape(HCC1806_unf_log2)[1]
colN
# One skewness value per column, using scipy's (biased) estimator.
df_skew_cells = [skew(HCC1806_unf_log2[col]) for col in cnames[:colN]]
df_skew_cells
plt.figure(figsize=(10,7))
sns.histplot(df_skew_cells, bins=100, color="teal")
plt.xlabel('Skewness of single cells expression profiles - log_2 df')
plt.show()
# Kurtosis of each cell in the log2-transformed data.
df_kurt_cells = [kurtosis(HCC1806_unf_log2[col]) for col in cnames[:colN]]
df_kurt_cells
plt.figure(figsize=(10,7))
sns.histplot(df_kurt_cells, bins=100, color="teal")
plt.xlabel('Kurtosis of single cells expression profiles - log_2 df')
plt.show()
We can definitely see that the data is much more normalized because most of the skewness and kurtosis values (not of all cells, but most) are close to 0. With respect to the MCF7 unfiltered dataset, the skewness of the new data is less concentrated near 0.
We now proceed by analysing the filtered and normalized data, in particular we look at the correlation between genes, differences between hypoxic and normoxic cells, their clustering and their skewness.
# First rows of the MCF7 filtered & normalised data (columns = cells),
# then per-gene summary statistics computed on the transpose.
MCF7_f_n.head()
MCF7_f_n_describe = MCF7_f_n.T.describe()
MCF7_f_n_describe
We look at the statistical data about the transposed data to compare differences in expression among genes. There are still many cells where mean is equal to 0 and many cells where the 75% quantile is equal to 0.
# Distribution of the per-gene means ("mean" row == .iloc[1] of describe()).
sns.displot(data=MCF7_f_n_describe.loc["mean"], bins=500, color="teal")
plt.xlim(-10, 2500)
plt.show()
# Distribution of the per-gene 75% quantiles ("75%" row == .iloc[6]).
sns.displot(data=MCF7_f_n_describe.loc["75%"], bins=500, color="teal")
plt.xlim(-10, 2500)
plt.show()
# Pearson correlation between the first 25 genes across all cells.
first_genes = MCF7_f_n.transpose().iloc[:, :25]
plt.figure(figsize=(25, 25))
sns.heatmap(first_genes.corr(), cbar=True, annot=True, cmap='Blues')
plt.show()
It can be quite interesting to see if there are major differences between the gene expressions in cells under hypoxic or normoxic conditions.
# Hypoxic cells only (column labels containing 'Hypo'), filtered/normalised data.
MCF7_Hypo_f_n = MCF7_f_n.filter(like='Hypo', axis=1)
MCF7_Hypo_f_n
# Transpose so rows are cells and columns are genes.
MCF7_Hypo_f_n_trans = MCF7_Hypo_f_n.transpose()
MCF7_Hypo_f_n_trans
# Per-gene summary statistics over the hypoxic cells.
MCF7_Hypo_f_n_trans_d = MCF7_Hypo_f_n_trans.describe()
MCF7_Hypo_f_n_trans_d
We considered only the hypoxic cells and transposed the dataset to analyse the expression of the genes, in particular we immediately saw that some genes are expressed in mean with a value close to 0 and others are over 10,000, hence some genes seem more relevant to hypoxic cells.
We did the same procedure with the normoxic cells and saw that also in this case some genes are much more expressed than others.
# Normoxic cells only (column labels containing 'Norm'), filtered/normalised data.
MCF7_Norm_f_n = MCF7_f_n.filter(like='Norm', axis=1)
MCF7_Norm_f_n
# Transpose so rows are cells and columns are genes.
MCF7_Norm_f_n_trans = MCF7_Norm_f_n.transpose()
MCF7_Norm_f_n_trans
# Per-gene summary statistics over the normoxic cells.
MCF7_Norm_f_n_trans_d = MCF7_Norm_f_n_trans.describe()
MCF7_Norm_f_n_trans_d
# Violin plots of the first 50 genes in the hypoxic cells.
hypo_first50 = MCF7_Hypo_f_n.transpose().iloc[:, :50]
plt.figure(figsize=(16, 4))
ax = sns.violinplot(data=hypo_first50, palette="Set3", cut=0)
plt.setp(ax.get_xticklabels(), rotation=90)
plt.show()
# Same genes for the normoxic cells, for a side-by-side comparison.
norm_first50 = MCF7_Norm_f_n.transpose().iloc[:, :50]
plt.figure(figsize=(16, 4))
ax = sns.violinplot(data=norm_first50, palette="Set3", cut=0)
plt.setp(ax.get_xticklabels(), rotation=90)
plt.show()
From the filtered and normalized violin plots it emerges that some genes are much more expressed in hypoxic cells than in normoxic cells (and viceversa). It is difficult to see from just the first 50 genes in the violin plots, later we will analyze the difference between some specific genes.\ We can see, for example, that the mean and the 25% and 75% quantile for the CYP1B1 gene are very different between the hypoxic and normoxic cells.
Let's now look for the genes that appear, either in diff or ratio of the means, to be more important in the differentiation between hypoxic and normoxic cells.
# Per-gene mean expression difference: hypoxic minus normoxic
# (the "mean" row of describe() is the same row as .iloc[1]).
MCF7_f_n_meandiff = MCF7_Hypo_f_n_trans_d.loc["mean"] - MCF7_Norm_f_n_trans_d.loc["mean"]
MCF7_f_n_meandiff
# Sort so genes most over-expressed under hypoxia come first.
MCF7_f_n_meandiff_sorted = MCF7_f_n_meandiff.sort_values(ascending=False)
MCF7_f_n_meandiff_sorted
We want to compare the gene expression across cells, so we find the difference between the gene expression in mean for hypoxic and normoxic cells and sort them. The first values show us the genes that are much more expressed in hypoxic cells than in normoxic cells, whereas the last values shows us the ones much more expressed in normoxic cells.
Using the ratio of means
# Per-gene ratio of mean expression, hypoxic over normoxic.
MCF7_f_n_meanratio = MCF7_Hypo_f_n_trans_d.loc["mean"] / MCF7_Norm_f_n_trans_d.loc["mean"]
MCF7_f_n_meanratio
# Sort so the genes with the largest hypoxic/normoxic ratio come first.
MCF7_f_n_meanratio_sorted = MCF7_f_n_meanratio.sort_values(ascending=False)
MCF7_f_n_meanratio_sorted
To get a more accurate comparison we repeat the same procedure by calculating the ratio between the mean gene expression in hypoxic and normoxic cells. Since some genes have mean expression 0, as a result we obtain both infinite and 0 values. Since in this case the comparison between the ratios is significant only with non-zero terms, we exclude such genes and compare the remaining ones.
# Drop genes whose ratio is infinite (normoxic mean == 0) or zero
# (hypoxic mean == 0); only strictly positive finite ratios are comparable.
MCF7_f_n_meanratio_sorted = MCF7_f_n_meanratio_sorted.replace([np.inf, -np.inf], np.nan)
MCF7_f_n_meanratio_sorted = MCF7_f_n_meanratio_sorted.replace(0, np.nan)
MCF7_f_n_meanratio_sorted = MCF7_f_n_meanratio_sorted.dropna()
MCF7_f_n_meanratio_sorted
Because this is extremely approximate and because many ratios were either 0 or infinite, we compute the difference between the log_2 of the means + 1 (a log-ratio) and return the values sorted.
# Log2 fold-change of the per-gene means (+1 keeps zero means finite):
# log2(hypo_mean + 1) - log2(norm_mean + 1).
MCF7_f_n_logratio = np.log2(MCF7_Hypo_f_n_trans_d.iloc[1] + 1) - np.log2(MCF7_Norm_f_n_trans_d.iloc[1] + 1)
MCF7_f_n_logratio.sort_values(ascending=False)
We now try to get the most important genes with a random forest to check whether they are the same as the ones we found before.
data_test = MCF7_f_n
# Rank genes by random-forest feature importance for the Hypo/Norm label.
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from matplotlib import pyplot as plt
# FIX: dropped the unused `load_boston` and `permutation_importance` imports;
# load_boston was removed in scikit-learn 1.2, so importing it crashes the cell.
plt.rcParams.update({'figure.figsize': (12.0, 8.0)})
plt.rcParams.update({'font.size': 14})
# Binary target from the cell labels: 0 = Hypoxia, 1 = Normoxia.
index = data_test.T.index
y = np.zeros((len(index)), dtype='int64')
for i in range(len(index)):
    y[i] = 'Hypo' not in index[i]  # 0 Hypo 1 Norm
X_train, X_test, y_train, y_test = train_test_split(data_test.T, y, test_size=0.1, random_state=42)
# NOTE(review): a regressor is fit on 0/1 labels; importances are still usable,
# but RandomForestClassifier would be the natural model here -- confirm intent.
rf = RandomForestRegressor(n_estimators=100)
rf.fit(X_train, y_train)
genes_importance = rf.feature_importances_
# Map each gene name to its importance and print them sorted (descending).
genes = {}
for n, g in zip(data_test.index, genes_importance):
    genes[n] = g
print({k: v for k, v in sorted(genes.items(), key=lambda item: item[1], reverse=True)})
Apparently, the gene that is most important in determining whether a cell is under hypoxic or normoxic conditions is NDRG1, which was fifth in the other analysis.
# to see the less important ones
# (same importance dict as above, now printed in ascending order)
print({k: v for k, v in sorted(genes.items(), key=lambda item: item[1], reverse=False)})
To see the differences in distribution of the gene, we plot some boxplots.
# NDRG1 expression in hypoxic vs normoxic cells
# (gene names in this dataset keep their literal quotes).
Gene_Hypo = MCF7_Hypo_f_n_trans['"NDRG1"']
Gene_Hypo
Gene_Norm = MCF7_Norm_f_n_trans['"NDRG1"']
Gene_Norm
Gene_Hypo.describe()
Gene_Norm.describe()
# Side-by-side boxplots; the y-axis is clipped to reduce outlier impact.
plt.figure(figsize=(16,16))
sns.boxplot(data=[Gene_Norm, Gene_Hypo])
plt.ylim(-100, 10000)
plt.xticks(ticks = [0, 1], labels=["Normoxic", "Hypoxic"])
plt.title("NDRG1")
plt.show()
We can see from the plot that the gene NDRG1 has very different distribution between the hypoxic and the normoxic cells.
We now plot the gene CYP1A1 because it was the one that, from the previous analysis, seemed more important.
# CYP1A1 expression in hypoxic vs normoxic cells (top gene of the
# log-ratio analysis above).
Gene_Hypo = MCF7_Hypo_f_n_trans['"CYP1A1"']
Gene_Hypo
Gene_Norm = MCF7_Norm_f_n_trans['"CYP1A1"']
Gene_Norm
Gene_Hypo.describe()
Gene_Norm.describe()
# Side-by-side boxplots; the y-axis is clipped to reduce outlier impact.
plt.figure(figsize=(16,16))
sns.boxplot(data=[Gene_Norm, Gene_Hypo])
plt.ylim(-100, 10000)
plt.xticks(ticks = [0, 1], labels=["Normoxic", "Hypoxic"])
plt.title("CYP1A1")
plt.show()
Again, we can see that the distribution of the genes under hypoxic and normoxic condition are very much different.
To better visualize the data, we performed a PCA in 2 dimensions and then used the data for K-means clustering. In such a way, we are trying to split the data in two groups and, by using different colours for hypoxic and normoxic cells, check whether the clustering assigned them correctly.
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
def plot_data_C(X, col):
    """Scatter each 2-D point X[i] using the matching colour col[i]."""
    for i, point in enumerate(X):
        plt.plot(point[0], point[1], 'k.', markersize=2, c=col[i])
def plot_centroids_C(centroids, weights=None, circle_color='w', cross_color='k'):
    """Draw cluster centroids as a white circle with a black cross on top."""
    if weights is not None:
        # keep only centroids carrying a non-negligible share of the weight
        centroids = centroids[weights > weights.max() / 10]
    xs, ys = centroids[:, 0], centroids[:, 1]
    plt.scatter(xs, ys,
                marker='o', s=35, linewidths=8,
                color=circle_color, zorder=10, alpha=0.9)
    plt.scatter(xs, ys,
                marker='x', s=2, linewidths=12,
                color=cross_color, zorder=11, alpha=1)
def plot_decision_boundaries(clusterer, X, col, resolution=1000, show_centroids=True,
show_xlabels=True, show_ylabels=True):
    # Draw the clusterer's decision regions over a dense grid covering the
    # 2-D data X, then overlay the points (coloured by `col`) and centroids.
    mins = X.min(axis=0) - 0.1
    maxs = X.max(axis=0) + 0.1
    xx, yy = np.meshgrid(np.linspace(mins[0], maxs[0], resolution),
    np.linspace(mins[1], maxs[1], resolution))
    # Predict a cluster id for every grid node; filled contours give the
    # regions, line contours give the boundaries.
    Z = clusterer.predict(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)
    plt.contourf(Z, extent=(mins[0], maxs[0], mins[1], maxs[1]),
    cmap="Pastel2")
    plt.contour(Z, extent=(mins[0], maxs[0], mins[1], maxs[1]),
    linewidths=1, colors='k')
    plot_data_C(X, col)
    if show_centroids:
        plot_centroids_C(clusterer.cluster_centers_)
    if show_xlabels:
        plt.xlabel("$x_1$", fontsize=14)
    else:
        plt.tick_params(labelbottom=False)
    if show_ylabels:
        plt.ylabel("$x_2$", fontsize=14, rotation=0)
    else:
        plt.tick_params(labelleft=False)
# Project the cells onto the first two principal components.
pcaM = PCA(n_components = 2)
MCF7_lowT = pcaM.fit_transform(MCF7_f_n.T)
MCF7_ret = pcaM.inverse_transform(MCF7_lowT)
print(MCF7_lowT.shape)
# Point colours: red = hypoxic cell, blue = normoxic cell.
col = ["r" if "Hypo" in c else "b" for c in MCF7_f_n.columns]
print(len(col))
# Integer labels for scoring: 0 = Hypoxia, 1 = Normoxia.
index = MCF7_f_n.T.index
MCF7_hyponormo = np.array(['Hypo' not in name for name in index], dtype='int64')
from sklearn import metrics
k = 2
for i in [42, 52, 28, 37, 67]:
    # Cluster the 2-D PCA projection and measure agreement with the labels.
    kmeans = KMeans(n_clusters=k, random_state=i)
    y_pred = kmeans.fit_predict(MCF7_lowT)
    # Map each true class to the cluster id most often assigned to it.
    mapping = {}
    for class_id in np.unique(MCF7_hyponormo):
        mode, _ = stats.mode(y_pred[MCF7_hyponormo == class_id])
        # FIX: SciPy >= 1.11 returns a scalar mode (keepdims defaults to
        # False), so `mode[0]` raises; ravel() handles both old and new SciPy.
        mapping[np.asarray(mode).ravel()[0]] = class_id
    # Relabel clusters with the classes they represent before scoring.
    y_pred = np.array([mapping[cluster_id] for cluster_id in y_pred])
    print("fraction of correct predictions", np.sum(y_pred == MCF7_hyponormo) / len(y_pred))
    plt.figure(figsize=(8, 4))
    plot_decision_boundaries(kmeans, MCF7_lowT, col)
    plt.show()
We tried to use different random states to calculate an average score of our K-means clustering. We can see that it predicts correctly 96% of the data in every case, this is probably due to the fact that we have used a dataset obtained from a PCA in 2 dimensions.
# Show how the K-means centroids move over the first Lloyd iterations;
# n_init=1 and a fixed random_state make every run start identically.
# NOTE(review): algorithm="full" was renamed "lloyd" in scikit-learn 1.1 and
# the old alias is removed in later releases -- confirm the pinned version.
kmeans_iter1 = KMeans(n_clusters=2, init="random", n_init=1,
algorithm="full", max_iter=1, random_state=20)
kmeans_iter2 = KMeans(n_clusters=2, init="random", n_init=1,
algorithm="full", max_iter=2, random_state=20)
kmeans_iter3 = KMeans(n_clusters=2, init="random", n_init=1,
algorithm="full", max_iter=3, random_state=20)
kmeans_iter10 = KMeans(n_clusters=2, init="random", n_init=1,
algorithm="full", max_iter=10, random_state=20)
kmeans_iter1.fit(MCF7_lowT)
kmeans_iter2.fit(MCF7_lowT)
kmeans_iter3.fit(MCF7_lowT)
kmeans_iter10.fit(MCF7_lowT)
# One subplot per max_iter value, all sharing the PCA-projected data.
plt.figure(figsize=(10, 8), tight_layout=True)
plt.subplot(321)
plot_decision_boundaries(kmeans_iter1, MCF7_lowT, col, show_xlabels=False, show_ylabels=False)
plt.title("max_iter: 1", fontsize=14)
plt.ylabel("$x_2$", fontsize=14, rotation=0)
plt.tick_params(labelbottom=False)
plt.subplot(322)
plot_decision_boundaries(kmeans_iter2, MCF7_lowT, col, show_centroids=False)
plot_centroids_C(kmeans_iter2.cluster_centers_)
plt.title("max_iter: 2", fontsize=14)
plt.subplot(323)
plot_decision_boundaries(kmeans_iter3, MCF7_lowT, col, show_centroids=False)
plot_centroids_C(kmeans_iter3.cluster_centers_)
plt.title("max_iter: 3", fontsize=14)
plt.subplot(324)
plot_decision_boundaries(kmeans_iter10, MCF7_lowT, col, show_centroids=False)
plot_centroids_C(kmeans_iter10.cluster_centers_)
plt.title("max_iter: 10", fontsize=14)
plt.show()
from sklearn import metrics
k = 2
accuracy_scores = []
for i in [42, 52, 28, 37, 67]:
    # Cluster the full-dimensional data (no PCA) for comparison.
    kmeans = KMeans(n_clusters=k, random_state=i)
    y_pred = kmeans.fit_predict(MCF7_f_n.T)
    # FIX: the original reused the `mapping` dict left over from the PCA
    # cell; the cluster-id/class correspondence of a fresh fit is arbitrary,
    # so the mapping must be recomputed for every fit.
    mapping = {}
    for class_id in np.unique(MCF7_hyponormo):
        mode, _ = stats.mode(y_pred[MCF7_hyponormo == class_id])
        # ravel() works for both SciPy < 1.11 (array) and >= 1.11 (scalar).
        mapping[np.asarray(mode).ravel()[0]] = class_id
    y_pred = np.array([mapping[cluster_id] for cluster_id in y_pred])
    print(accuracy_score(MCF7_hyponormo, y_pred))
    print("fraction of correct predictions", np.sum(y_pred == MCF7_hyponormo) / len(y_pred))
    accuracy_scores.append(accuracy_score(MCF7_hyponormo, y_pred))
print("Average accuracy score:", sum(accuracy_scores)/len(accuracy_scores))
We then repeated the clustering using the dataset not processed by the PCA, obtaining a slightly higher accuracy score. This shows us that in this particular case, PCA in 2 dimensions keeps enough information from the original dataset.
from sklearn.cluster import kmeans_plusplus
k = 2
accuracy_scores = []
for i in [110, 42, 66, 98, 200]:
    # Pick k-means++ seed points, then run K-means initialised from them.
    centroids, indices = kmeans_plusplus(MCF7_f_n.T.to_numpy(), n_clusters = 2, random_state=i)
    kmeans = KMeans(n_clusters=k, random_state=i)
    # NOTE(review): this fits on the two seed points themselves rather than
    # passing init=centroids and fitting on the data -- confirm intended.
    kmeans.fit(centroids)
    y_pred = kmeans.predict(MCF7_f_n.T)
    # Map each true class to its most frequent cluster id.
    mapping = {}
    for class_id in np.unique(MCF7_hyponormo):
        mode, _ = stats.mode(y_pred[MCF7_hyponormo == class_id])
        # FIX: `mode[0]` breaks on SciPy >= 1.11 where mode is a scalar;
        # ravel() handles both old and new SciPy.
        mapping[np.asarray(mode).ravel()[0]] = class_id
    y_pred = np.array([mapping[cluster_id] for cluster_id in y_pred])
    print(accuracy_score(MCF7_hyponormo, y_pred))
    accuracy_scores.append(accuracy_score(MCF7_hyponormo, y_pred))
print("Average accuracy score:", sum(accuracy_scores)/len(accuracy_scores))
We finally tried to find the centroids using K-means++ and then used them to cluster the data. We can see that the result is really close to the previous case where we used K-means.
from scipy.stats import kurtosis, skew
from sklearn.preprocessing import PowerTransformer, QuantileTransformer
# Per-cell skewness of the filtered & normalised expression profiles.
cnames = list(MCF7_f_n.columns)
colN = np.shape(MCF7_f_n)[1]
colN
df_skew_cells = [skew(MCF7_f_n[name]) for name in cnames]
print("Minimum skewness:", min(df_skew_cells))
g = sns.histplot(df_skew_cells, bins=100, color="teal")
sns.set(rc={'figure.figsize':(10, 5)})
g.set(xlim = (0, None))
plt.xlabel('Skewness of single cells expression profiles - filtered and normalized df')
plt.show()
Since the skewness tells us how much the gene expression of a single cell is normalized, by plotting the histogram of the skewness across the cells it's easy to see that most of the values are far from the origin, and in particular no value is smaller than 8.47, hence distributions are not normalized.
import random
data = MCF7_f_n
# 2x2 grid: the cells with the smallest and largest skewness plus two
# random cells, each annotated with its skewness and kurtosis.
f, axes = plt.subplots(2, 2, figsize=(14, 9 ), sharex=True)
r = [np.argmin(df_skew_cells), np.argmax(df_skew_cells)]
r += random.sample(range(250), 2)
# FIX: sns.distplot was deprecated in seaborn 0.11 and removed in 0.14;
# histplot(..., stat="density", kde=True) is the supported equivalent.
sns.histplot(data.iloc[:,r[0]], color="skyblue", ax=axes[0,0], bins=200, stat="density", kde=True)
sns.histplot(data.iloc[:,r[1]], color="olive", ax=axes[0,1], bins=200, stat="density", kde=True)
sns.histplot(data.iloc[:, r[2]], color="gold", ax=axes[1,0], bins=200, stat="density", kde=True)
sns.histplot(data.iloc[:,r[3]], color="teal", ax=axes[1,1], bins=200, stat="density", kde=True)
axes[0,0].set_xlim(0,1000)
for i, ax in zip(r, axes.reshape(-1)):
    ax.text(x=0.97, y=0.97, transform=ax.transAxes, s="Skewness: %f" % data.iloc[:,i].skew(),
            fontweight='demibold', fontsize=10, verticalalignment='top', horizontalalignment='right',
            backgroundcolor='white', color='xkcd:poo brown')
    ax.text(x=0.97, y=0.91, transform=ax.transAxes, s="Kurtosis: %f" % data.iloc[:,i].kurt(),
            fontweight='demibold', fontsize=10, verticalalignment='top', horizontalalignment='right',
            backgroundcolor='white', color='xkcd:dried blood')
f.tight_layout()
plt.show()
We then plotted the distribution of the genes of 2 random cells, with the distributions of the cells with greatest and smallest skewness, and calculated their skewness and kurtosis. As a result, we immediately see that none of them has normal distribution, probably due to the sparsity of the data.
We now proceed to analyze the HCC1806 dataset as we did for MCF7.
# First rows and per-cell summary statistics of the HCC1806
# filtered & normalised dataset (columns = cells).
HCC1806_f_n.head()
HCC1806_f_n.describe()
HCC1806_f_n_describe = HCC1806_f_n.describe()
HCC1806_f_n_describe
Plotting the mean of the cells.
# Histogram of the per-cell means ("mean" row == .iloc[1] of describe()).
sns.displot(data=HCC1806_f_n_describe.loc["mean"], bins=50, color="teal")
plt.show()
Plotting the 75% quantile of the cells.
# Histogram of the per-cell 75% quantiles ("75%" row == .iloc[6]).
sns.displot(data=HCC1806_f_n_describe.loc["75%"], bins=50, color="teal")
plt.show()
It is nice to see that many cells have mean different from 0. That said, because many of them have third quartile equal to 0, many gene counts will be equal to 0.
Plotting now some informations about the genes.
# Re-describe on the transpose: rows are now genes, so the statistics
# below are per-gene rather than per-cell.
HCC1806_f_n_describe = HCC1806_f_n.T.describe()
HCC1806_f_n_describe
Plotting the mean of the genes.
# Histogram of the per-gene means ("mean" row == .iloc[1] of describe()).
sns.displot(data=HCC1806_f_n_describe.loc["mean"], bins=500, color="teal")
plt.xlim(-100, 2500)
plt.show()
Plotting the 75% quantile of the genes.
# Histogram of the per-gene 75% quantiles ("75%" row == .iloc[6]).
sns.displot(data=HCC1806_f_n_describe.loc["75%"], bins=500, color="teal")
plt.xlim(-100, 2500)
plt.show()
Many genes have both mean and 75% quantile near to 0.
# Pearson correlation between the first 25 genes across all cells.
first_genes = HCC1806_f_n.transpose().iloc[:, :25]
plt.figure(figsize=(25, 25))
sns.heatmap(first_genes.corr(), cbar=True, annot=True, cmap='Blues')
plt.show()
# Split the HCC1806 cells by condition using the column labels.
HCC1806_Hypo_f_n = HCC1806_f_n.filter(like='Hypoxia', axis=1)
HCC1806_Hypo_f_n
HCC1806_Norm_f_n = HCC1806_f_n.filter(like='Normoxia', axis=1)
HCC1806_Norm_f_n
# Violin plots of the first 50 genes, normoxic cells.
norm_first50 = HCC1806_Norm_f_n.transpose().iloc[:, :50]
plt.figure(figsize=(16, 4))
ax = sns.violinplot(data=norm_first50, palette="Set3", cut=0)
plt.setp(ax.get_xticklabels(), rotation=90)
plt.show()
# Same genes for the hypoxic cells.
hypo_first50 = HCC1806_Hypo_f_n.transpose().iloc[:, :50]
plt.figure(figsize=(16, 4))
ax = sns.violinplot(data=hypo_first50, palette="Set3", cut=0)
plt.setp(ax.get_xticklabels(), rotation=90)
plt.show()
Even though it is not as clear as in the MCF7 case, we can still see that there are some differences between the distibutions of genes in hypoxic or normoxic conditions.
We compute again the difference of the log_2 of the means + 1 to see which genes are more important than others in classifying the hypoxic and normoxic conditions. Later we also compute it with random forests.
# Transpose to gene columns, summarise, then compute the per-gene log2
# fold-change of the means (+1 keeps zero means finite).
HCC1806_Hypo_f_n_trans = HCC1806_Hypo_f_n.T
HCC1806_Norm_f_n_trans = HCC1806_Norm_f_n.T
HCC1806_Hypo_f_n_trans_d = HCC1806_Hypo_f_n.T.describe()
HCC1806_Norm_f_n_trans_d = HCC1806_Norm_f_n.T.describe()
HCC1806_f_n_logratio = np.log2(HCC1806_Hypo_f_n_trans_d.iloc[1] + 1) - np.log2(HCC1806_Norm_f_n_trans_d.iloc[1] + 1)
HCC1806_f_n_logratio.sort_values(ascending=False)
Apparently, the gene CA9 is the most important in the HCC1806 dataset.
We now try to get the most important genes with a random forest to check whether they are the same as the ones we found before.
data_test = HCC1806_f_n
# Rank genes by random-forest feature importance for the Hypo/Norm label.
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from matplotlib import pyplot as plt
# FIX: dropped the unused `load_boston` and `permutation_importance` imports;
# load_boston was removed in scikit-learn 1.2, so importing it crashes the cell.
plt.rcParams.update({'figure.figsize': (12.0, 8.0)})
plt.rcParams.update({'font.size': 14})
# Binary target from the cell labels: 0 = Hypoxia, 1 = Normoxia.
index = data_test.T.index
y = np.zeros((len(index)), dtype='int64')
for i in range(len(index)):
    y[i] = 'Hypo' not in index[i]  # 0 Hypo 1 Norm
X_train, X_test, y_train, y_test = train_test_split(data_test.T, y, test_size=0.1, random_state=42)
# NOTE(review): a regressor is fit on 0/1 labels -- RandomForestClassifier
# would be the natural model here; confirm intent.
rf = RandomForestRegressor(n_estimators=100)
rf.fit(X_train, y_train)
genes_importance = rf.feature_importances_
# Map each gene name to its importance and print them sorted (descending).
genes = {}
for n, g in zip(data_test.index, genes_importance):
    genes[n] = g
print({k: v for k, v in sorted(genes.items(), key=lambda item: item[1], reverse=True)})
Apparently, the gene that is most important in determining whether a cell is under hypoxic or normoxic conditions is PGK1, which wasn't among the first genes in the other analysis.
# to see the less important ones.
# (same importance dict as above, now printed in ascending order)
print({k: v for k, v in sorted(genes.items(), key=lambda item: item[1], reverse=False)})
To see the differences in distribution of the gene, we plot some boxplots.
# PGK1 expression in hypoxic vs normoxic HCC1806 cells
# (top gene of the random-forest importance ranking above).
Gene_Hypo = HCC1806_Hypo_f_n_trans['"PGK1"']
Gene_Hypo
Gene_Norm = HCC1806_Norm_f_n_trans['"PGK1"']
Gene_Norm
Gene_Hypo.describe()
Gene_Norm.describe()
plt.figure(figsize=(16,16))
sns.boxplot(data=[Gene_Norm, Gene_Hypo])
plt.xticks(ticks = [0, 1], labels=["Normoxic", "Hypoxic"])
plt.title("PGK1")
plt.show()
We can see from the plot that the gene PGK1 has very different distribution between the hypoxic and the normoxic cells.
We now plot the gene CA9 because it was the one that, from the previous analysis, seemed more important.
# CA9 expression in hypoxic vs normoxic HCC1806 cells
# (top gene of the log-ratio analysis above).
Gene_Hypo = HCC1806_Hypo_f_n_trans['"CA9"']
Gene_Hypo
Gene_Norm = HCC1806_Norm_f_n_trans['"CA9"']
Gene_Norm
Gene_Hypo.describe()
Gene_Norm.describe()
plt.figure(figsize=(16,16))
sns.boxplot(data=[Gene_Norm, Gene_Hypo])
plt.xticks(ticks = [0, 1], labels=["Normoxic", "Hypoxic"])
plt.title("CA9")
plt.show()
Again, we can see that the distributions of the gene under hypoxic and normoxic conditions are very different.\ It is also quite nice to see that we didn't need to use any ylim. We can interpret this by saying that, in HCC1806, the outliers are less important in determining whether a cell is hypoxic or normoxic than in MCF7.
To better visualize the data, we performed a PCA in 2 dimensions and then used the data for K-means clustering. In such a way, we are trying to split the data in two groups and, by using different colours for hypoxic and normoxic cells, check whether the clustering assigned them correctly.
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
def plot_data_C(X, col):
    """Scatter each 2-D point X[i] using the matching colour col[i]."""
    for i, point in enumerate(X):
        plt.plot(point[0], point[1], 'k.', markersize=2, c=col[i])
def plot_centroids_C(centroids, weights=None, circle_color='w', cross_color='k'):
    """Draw cluster centroids as a white circle with a black cross on top."""
    if weights is not None:
        # keep only centroids carrying a non-negligible share of the weight
        centroids = centroids[weights > weights.max() / 10]
    xs, ys = centroids[:, 0], centroids[:, 1]
    plt.scatter(xs, ys,
                marker='o', s=35, linewidths=8,
                color=circle_color, zorder=10, alpha=0.9)
    plt.scatter(xs, ys,
                marker='x', s=2, linewidths=12,
                color=cross_color, zorder=11, alpha=1)
def plot_decision_boundaries(clusterer, X, col, resolution=1000, show_centroids=True,
show_xlabels=True, show_ylabels=True):
    # Draw the clusterer's decision regions over a dense grid covering the
    # 2-D data X, then overlay the points (coloured by `col`) and centroids.
    mins = X.min(axis=0) - 0.1
    maxs = X.max(axis=0) + 0.1
    xx, yy = np.meshgrid(np.linspace(mins[0], maxs[0], resolution),
    np.linspace(mins[1], maxs[1], resolution))
    # Predict a cluster id for every grid node; filled contours give the
    # regions, line contours give the boundaries.
    Z = clusterer.predict(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)
    plt.contourf(Z, extent=(mins[0], maxs[0], mins[1], maxs[1]),
    cmap="Pastel2")
    plt.contour(Z, extent=(mins[0], maxs[0], mins[1], maxs[1]),
    linewidths=1, colors='k')
    plot_data_C(X, col)
    if show_centroids:
        plot_centroids_C(clusterer.cluster_centers_)
    if show_xlabels:
        plt.xlabel("$x_1$", fontsize=14)
    else:
        plt.tick_params(labelbottom=False)
    if show_ylabels:
        plt.ylabel("$x_2$", fontsize=14, rotation=0)
    else:
        plt.tick_params(labelleft=False)
# Project the HCC1806 cells onto the first two principal components.
pcaM = PCA(n_components = 2)
HCC1806_lowT = pcaM.fit_transform(HCC1806_f_n.T)
HCC1806_ret = pcaM.inverse_transform(HCC1806_lowT)
print(HCC1806_lowT.shape)
# Point colours: red = hypoxic cell, blue = normoxic cell.
col = ["r" if "Hypo" in c else "b" for c in HCC1806_f_n.columns]
print(len(col))
# Integer labels for scoring: 0 = Hypoxia, 1 = Normoxia.
index = HCC1806_f_n.T.index
HCC1806_hyponormo = np.array(['Hypo' not in name for name in index], dtype='int64')
print(HCC1806_hyponormo)
from sklearn import metrics
k = 2
for i in [42, 44, 28, 36, 40]:
    # Cluster the 2-D PCA projection and measure agreement with the labels.
    kmeans = KMeans(n_clusters=k, random_state=i)
    y_pred = kmeans.fit_predict(HCC1806_lowT)
    # Map each true class to the cluster id most often assigned to it.
    mapping = {}
    for class_id in np.unique(HCC1806_hyponormo):
        mode, _ = stats.mode(y_pred[HCC1806_hyponormo == class_id])
        # ravel() works for both SciPy < 1.11 (array) and >= 1.11 (scalar).
        mapping[np.asarray(mode).ravel()[0]] = class_id
    # FIX: the original never applied `mapping` (the relabelling line was
    # commented out), so raw cluster ids were compared against class labels
    # and the reported fraction was meaningless; relabel before scoring.
    y_pred = np.array([mapping[cluster_id] for cluster_id in y_pred])
    print("fraction of correct predictions", np.sum(y_pred == HCC1806_hyponormo) / len(y_pred))
    plt.figure(figsize=(8, 4))
    plot_decision_boundaries(kmeans, HCC1806_lowT, col)
    plt.show()
We tried to use different random states to calculate an average score of our K-means clustering. We can see that it predicts correctly only around 48% of the data in every case; this is probably due to the fact that a PCA in 2 dimensions retains low variance in this particular dataset.
Let's now try clustering without applying PCA.
from sklearn import metrics
k = 2
accuracy_scores = []
for i in [42, 52, 28, 37, 67]:
    # Cluster the full-dimensional data (no PCA) for comparison.
    kmeans = KMeans(n_clusters=k, random_state=i)
    y_pred = kmeans.fit_predict(HCC1806_f_n.T)
    # Map each true class to the cluster id most often assigned to it.
    mapping = {}
    for class_id in np.unique(HCC1806_hyponormo):
        mode, _ = stats.mode(y_pred[HCC1806_hyponormo == class_id])
        # ravel() works for both SciPy < 1.11 (array) and >= 1.11 (scalar).
        mapping[np.asarray(mode).ravel()[0]] = class_id
    # FIX: the original computed `mapping` but left the relabelling line
    # commented out, so the accuracy compared raw cluster ids with class
    # labels; apply the mapping before scoring.
    y_pred = np.array([mapping[cluster_id] for cluster_id in y_pred])
    print(accuracy_score(HCC1806_hyponormo, y_pred))
    print("fraction of correct predictions", np.sum(y_pred == HCC1806_hyponormo) / len(y_pred))
    accuracy_scores.append(accuracy_score(HCC1806_hyponormo, y_pred))
print("Average accuracy score:", sum(accuracy_scores)/len(accuracy_scores))
We then repeated the clustering using the dataset not processed by the PCA, obtaining a slightly higher accuracy score. Even using the whole dataset, the clustering accuracy is still pretty low.
from sklearn.cluster import kmeans_plusplus
k = 2
accuracy_scores = []
for i in [110, 42, 66, 80, 120]:
    # Pick k-means++ seed points, then run K-means initialised from them.
    centroids, indices = kmeans_plusplus(HCC1806_f_n.T.to_numpy(), n_clusters = 2, random_state=i)
    kmeans = KMeans(n_clusters=k, random_state=i)
    # NOTE(review): this fits on the two seed points themselves rather than
    # passing init=centroids and fitting on the data -- confirm intended.
    kmeans.fit(centroids)
    y_pred = kmeans.predict(HCC1806_f_n.T)
    # Map each true class to its most frequent cluster id.
    mapping = {}
    for class_id in np.unique(HCC1806_hyponormo):
        mode, _ = stats.mode(y_pred[HCC1806_hyponormo == class_id])
        # FIX: `mode[0]` breaks on SciPy >= 1.11 where mode is a scalar;
        # ravel() handles both old and new SciPy.
        mapping[np.asarray(mode).ravel()[0]] = class_id
    y_pred = np.array([mapping[cluster_id] for cluster_id in y_pred])
    print(accuracy_score(HCC1806_hyponormo, y_pred))
    accuracy_scores.append(accuracy_score(HCC1806_hyponormo, y_pred))
print("Average accuracy score:", sum(accuracy_scores)/len(accuracy_scores))
We finally tried to find the centroids using K-means++ and then used them to cluster the data. We can see that the result is much better than the previous case where we used K-means.
MCF7_drop_f_n.describe()
From the data we can immediately notice that the mean of genes expressed and the maximum value is much lower compared to the data obtained using SmartSeq
Now we will analyze the difference between the hypoxic and normoxic cells for a particular gene.
# Hypoxic / normoxic subsets of the Drop-seq data, then the per-gene
# log2 fold-change of the means (+1 keeps zero means finite).
MCF7_drop_Hypo_f_n = MCF7_drop_f_n.filter(like='Hypoxia', axis=1)
MCF7_drop_Hypo_f_n
MCF7_drop_Norm_f_n = MCF7_drop_f_n.filter(like='Normoxia', axis=1)
MCF7_drop_Norm_f_n
MCF7_drop_Hypo_f_n_trans = MCF7_drop_Hypo_f_n.T
MCF7_drop_Norm_f_n_trans = MCF7_drop_Norm_f_n.T
MCF7_drop_Hypo_f_n_trans_d = MCF7_drop_Hypo_f_n_trans.describe()
MCF7_drop_Norm_f_n_trans_d = MCF7_drop_Norm_f_n_trans.describe()
MCF7_drop_f_n_logratio = np.log2(MCF7_drop_Hypo_f_n_trans_d.loc["mean"] + 1) - np.log2(MCF7_drop_Norm_f_n_trans_d.loc["mean"] + 1)
MCF7_drop_f_n_logratio.sort_values(ascending=False)
Apparently, the gene TFF1 is the most important in the MCF7 dropseq dataset.
We now try to get the most important genes with a random forest to check whether they are the same as the ones we found before.
data_test = MCF7_drop_f_n
# Rank genes by random-forest feature importance for the Hypo/Norm label.
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from matplotlib import pyplot as plt
# FIX: dropped the unused `load_boston` and `permutation_importance` imports;
# load_boston was removed in scikit-learn 1.2, so importing it crashes the cell.
plt.rcParams.update({'figure.figsize': (12.0, 8.0)})
plt.rcParams.update({'font.size': 14})
# Binary target from the cell labels: 0 = Hypoxia, 1 = Normoxia.
index = data_test.T.index
y = np.zeros((len(index)), dtype='int64')
for i in range(len(index)):
    y[i] = 'Hypo' not in index[i]  # 0 Hypo 1 Norm
X_train, X_test, y_train, y_test = train_test_split(data_test.T, y, test_size=0.1, random_state=42)
# Only 10 trees here -- the Drop-seq matrix is larger, so this cell trades
# importance stability for run time.
rf = RandomForestRegressor(n_estimators=10)
rf.fit(X_train, y_train)
genes_importance = rf.feature_importances_
# Map each gene name to its importance and print them sorted (descending).
genes = {}
for n, g in zip(data_test.index, genes_importance):
    genes[n] = g
print({k: v for k, v in sorted(genes.items(), key=lambda item: item[1], reverse=True)})
Apparently, the gene that is most important in determining whether a cell is under hypoxic or normoxic conditions is still TFF1, which was also ranked first in the other analysis.
# to see the less important ones.
# (same importance dict as above, now printed in ascending order)
print({k: v for k, v in sorted(genes.items(), key=lambda item: item[1], reverse=False)})
To see the differences in distribution of the gene, we plot some boxplots.
# TFF1 expression in hypoxic vs normoxic Drop-seq cells
# (top gene in both the log-ratio and random-forest rankings).
Gene_Hypo = MCF7_drop_Hypo_f_n_trans['"TFF1"']
Gene_Hypo
Gene_Norm = MCF7_drop_Norm_f_n_trans['"TFF1"']
Gene_Norm
Gene_Hypo.describe()
Gene_Norm.describe()
plt.figure(figsize=(16,16))
sns.boxplot(data=[Gene_Norm, Gene_Hypo])
plt.xticks(ticks = [0, 1], labels=["Normoxic", "Hypoxic"])
plt.title("TFF1")
plt.show()
We can see from the plot that the gene TFF1 has a very different distribution between the hypoxic and the normoxic cells. What we can definitely see is that the outliers are very different.
To better visualize the data, we performed a PCA in 2 dimensions and then used the data for K-means clustering. In such a way, we are trying to split the data in two groups and, by using different colours for hypoxic and normoxic cells, check whether the clustering assigned them correctly.
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
def plot_data_C(X, col):
    """Scatter each 2-D point X[i] using the matching colour col[i]."""
    for i, point in enumerate(X):
        plt.plot(point[0], point[1], 'k.', markersize=2, c=col[i])
def plot_centroids_C(centroids, weights=None, circle_color='w', cross_color='k'):
    """Draw cluster centroids as a white circle with a black cross on top."""
    if weights is not None:
        # keep only centroids carrying a non-negligible share of the weight
        centroids = centroids[weights > weights.max() / 10]
    xs, ys = centroids[:, 0], centroids[:, 1]
    plt.scatter(xs, ys,
                marker='o', s=35, linewidths=8,
                color=circle_color, zorder=10, alpha=0.9)
    plt.scatter(xs, ys,
                marker='x', s=2, linewidths=12,
                color=cross_color, zorder=11, alpha=1)
def plot_decision_boundaries(clusterer, X, col, resolution=1000, show_centroids=True,
show_xlabels=True, show_ylabels=True):
mins = X.min(axis=0) - 0.1
maxs = X.max(axis=0) + 0.1
xx, yy = np.meshgrid(np.linspace(mins[0], maxs[0], resolution),
np.linspace(mins[1], maxs[1], resolution))
Z = clusterer.predict(np.c_[xx.ravel(), yy.ravel()])
Z = Z.reshape(xx.shape)
plt.contourf(Z, extent=(mins[0], maxs[0], mins[1], maxs[1]),
cmap="Pastel2")
plt.contour(Z, extent=(mins[0], maxs[0], mins[1], maxs[1]),
linewidths=1, colors='k')
plot_data_C(X, col)
if show_centroids:
plot_centroids_C(clusterer.cluster_centers_)
if show_xlabels:
plt.xlabel("$x_1$", fontsize=14)
else:
plt.tick_params(labelbottom=False)
if show_ylabels:
plt.ylabel("$x_2$", fontsize=14, rotation=0)
else:
plt.tick_params(labelleft=False)
pcaM = PCA(n_components = 2)
MCF7_drop_lowT = pcaM.fit_transform(MCF7_drop_f_n.T)
MCF7_drop_ret = pcaM.inverse_transform(MCF7_drop_lowT)
print(MCF7_drop_lowT.shape)
col = []
for c in MCF7_drop_f_n.columns:
if "Hypo" in c:
col.append("r")
else:
col.append("b")
print(len(col))
index = MCF7_drop_f_n.T.index
MCF7_drop_hyponormo = np.zeros((len(index)), dtype = 'int64')
for i in range(len(index)):
MCF7_drop_hyponormo[i] = not 'Hypo' in index[i] # 0 Hypo 1 Norm
from sklearn import metrics
k = 2
for i in [45, 52, 30, 37, 67]:
kmeans = KMeans(n_clusters=k, random_state=i)
y_pred = kmeans.fit_predict(MCF7_drop_lowT)
mapping = {}
for class_id in np.unique(MCF7_drop_hyponormo):
mode, _ = stats.mode(y_pred[MCF7_drop_hyponormo==class_id]) # look at all samples with a given label and study what is the most common label given by the Gaussian mixture
mapping[mode[0]] = class_id
#y_pred = np.array([mapping[cluster_id] for cluster_id in y_pred])
# print(accuracy_score(y_pred, MCF7_hyponormo))
print("fraction of correct predictions", np.sum(y_pred==MCF7_drop_hyponormo) / len(y_pred))
plt.figure(figsize=(8, 4))
plot_decision_boundaries(kmeans, MCF7_drop_lowT, col)
plt.show()
We tried to use different random states to calculate an average score of our K-means clustering. We can see that it predicts correctly only around 44% of the data in every case, this is probably due to the fact that a PCA in 2 dimention retain low variance in this particular dataset.
Let's now try clustering without applying PCA.
from sklearn import metrics
k = 2
accuracy_scores = []
for i in [42, 48, 28, 37, 67]:
kmeans = KMeans(n_clusters=k, random_state=i)
y_pred = kmeans.fit_predict(MCF7_drop_f_n.T)
mapping = {}
for class_id in np.unique(MCF7_drop_hyponormo):
mode, _ = stats.mode(y_pred[MCF7_drop_hyponormo==class_id]) # look at all samples with a given label and study what is the most common label given by the Gaussian mixture
mapping[mode[0]] = class_id
# y_pred = np.array([mapping[cluster_id] for cluster_id in y_pred])
print(accuracy_score(MCF7_drop_hyponormo, y_pred))
print("fraction of correct predictions", np.sum(y_pred==MCF7_drop_hyponormo) / len(y_pred))
accuracy_scores.append(accuracy_score(MCF7_drop_hyponormo, y_pred))
print("Average accuracy score:", sum(accuracy_scores)/len(accuracy_scores))
We then repeated the clustering using the dataset not processed by the PCA, obtaining a slightly higher accuracy score. Even using the whole dataset, the clustering accuracy is still pretty low.
from sklearn.cluster import kmeans_plusplus
k = 2
accuracy_scores = []
for i in [85, 42, 65, 98, 200]:
centroids, indices = kmeans_plusplus(MCF7_drop_f_n.T.to_numpy(), n_clusters = 2, random_state=i)
kmeans = KMeans(n_clusters=k, random_state=i)
kmeans.fit(centroids)
y_pred = kmeans.predict(MCF7_drop_f_n.T)
mapping = {}
for class_id in np.unique(MCF7_drop_hyponormo):
mode, _ = stats.mode(y_pred[MCF7_drop_hyponormo==class_id]) # look at all samples with a given label and study what is the most common label given by the Gaussian mixture
mapping[mode[0]] = class_id
# y_pred = np.array([mapping[cluster_id] for cluster_id in y_pred])
print(accuracy_score(MCF7_drop_hyponormo, y_pred))
#print("fraction of correct predictions", np.sum(y_pred==MCF7_hyponormo) / len(y_pred))
accuracy_scores.append(accuracy_score(MCF7_drop_hyponormo, y_pred))
print("Average accuracy score:", sum(accuracy_scores)/len(accuracy_scores))
We finally tried to find the centroids using K-means++ and then used them to cluster the data. We can see that the result is a little better than the previous case where we used K-means.
HCC1806_drop_f_n_describe = HCC1806_drop_f_n.describe()
HCC1806_drop_f_n_describe
sns.displot(data = HCC1806_drop_f_n_describe.iloc[1], bins=50, color="teal")
plt.show()
sns.displot(data = HCC1806_drop_f_n_describe.iloc[6], bins=50, color="teal")
plt.show()
HCC1806_drop_Hypo_f_n = HCC1806_drop_f_n.loc[:, HCC1806_drop_f_n.columns.str.contains('Hypoxia')]
HCC1806_drop_Hypo_f_n
HCC1806_drop_Norm_f_n = HCC1806_drop_f_n.loc[:, HCC1806_drop_f_n.columns.str.contains('Normoxia')]
HCC1806_drop_Norm_f_n
df_small = HCC1806_drop_Norm_f_n.transpose().iloc[0:, :50]
plt.figure(figsize=(16,4))
plot=sns.violinplot(data=df_small,palette="Set3",cut=0)
plt.setp(plot.get_xticklabels(), rotation=90)
plt.show()
df_small = HCC1806_drop_Hypo_f_n.transpose().iloc[0:, :50]
plt.figure(figsize=(16,4))
plot=sns.violinplot(data=df_small,palette="Set3",cut=0)
plt.setp(plot.get_xticklabels(), rotation=90)
plt.show()
It is quite hard to see a major difference.\ We again compute the ratio of the log_2 of the mean + 1.
HCC1806_drop_Hypo_f_n_trans = HCC1806_drop_Hypo_f_n.T
HCC1806_drop_Norm_f_n_trans = HCC1806_drop_Norm_f_n.T
HCC1806_drop_Hypo_f_n_trans_d = HCC1806_drop_Hypo_f_n.T.describe()
HCC1806_drop_Norm_f_n_trans_d = HCC1806_drop_Norm_f_n.T.describe()
HCC1806_drop_f_n_logratio = np.log2(HCC1806_drop_Hypo_f_n_trans_d.iloc[1] + 1) - np.log2(HCC1806_drop_Norm_f_n_trans_d.iloc[1] + 1)
HCC1806_drop_f_n_logratio.sort_values(ascending=False)
Apparently, the gene IGFBP3 is the most important in the HCC1806 dropseq dataset.
We now try to get the most important genes with a random forest to check wheter they are the same as the ones we found before.
data_test = HCC1806_drop_f_n
# Let's load the packages
import numpy as np
import pandas as pd
from sklearn.datasets import load_boston
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.inspection import permutation_importance
#import shap
from matplotlib import pyplot as plt
plt.rcParams.update({'figure.figsize': (12.0, 8.0)})
plt.rcParams.update({'font.size': 14})
index = data_test.T.index
y = np.zeros((len(index)),dtype = 'int64')
for i in range(len(index)):
y[i] = not 'Hypo' in index[i] # 0 Hypo 1 Norm
X_train, X_test, y_train, y_test = train_test_split(data_test.T, y, test_size= 0.1, random_state= 42)
rf = RandomForestRegressor(n_estimators=10)
rf.fit(X_train, y_train)
genes_importance = rf.feature_importances_
genes = {}
for n, g in zip(data_test.index, genes_importance):
genes[n] = g
print({k: v for k, v in sorted(genes.items(), key=lambda item: item[1], reverse=True)})
Apparently, the gene that is more important in determining wether a cell is under hypoxic or normoxic condition is still the NDRG1, which wasn't in the first genes in the other analysis.
# to see the less important ones.
print({k: v for k, v in sorted(genes.items(), key=lambda item: item[1], reverse=False)})
To see the differences in distribution of the gene, we plot some boxplots.
Gene_Hypo = HCC1806_drop_Hypo_f_n_trans['"NDRG1"']
Gene_Hypo
Gene_Norm = HCC1806_drop_Norm_f_n_trans['"NDRG1"']
Gene_Norm
Gene_Hypo.describe()
Gene_Norm.describe()
plt.figure(figsize=(16,16))
sns.boxplot(data=[Gene_Norm, Gene_Hypo])
plt.xticks(ticks = [0, 1], labels=["Normoxic", "Hypoxic"])
plt.title("NDRG1")
plt.show()
We can see from the plot that the gene NDRG1 has very different distribution between the hypoxic and the normoxic cells. What we can definetely see is that the outliners are very different.
Let's now plot the one we got from the first analysis.
Gene_Hypo = HCC1806_drop_Hypo_f_n_trans['"IGFBP3"']
Gene_Hypo
Gene_Norm = HCC1806_drop_Norm_f_n_trans['"IGFBP3"']
Gene_Norm
plt.figure(figsize=(16,16))
sns.boxplot(data=[Gene_Norm, Gene_Hypo])
plt.xticks(ticks = [0, 1], labels=["Normoxic", "Hypoxic"])
plt.title("IGFBP3")
plt.show()
We can see that the distribution is very different.
To better visualize the data, we performed a PCA in 2 dimensions and then used the data for K-means clustering. In such a way, we are trying to split the data in two groups and, by using different colours for hypoxic and normoxic cells, check whether the clustering assigned them correctly.
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
def plot_data_C(X, col):
for i in range(len(X)):
plt.plot(X[i][0], X[i][1], 'k.', markersize=2, c=col[i])
def plot_centroids_C(centroids, weights=None, circle_color='w', cross_color='k'):
if weights is not None:
centroids = centroids[weights > weights.max() / 10]
plt.scatter(centroids[:, 0], centroids[:, 1],
marker='o', s=35, linewidths=8,
color=circle_color, zorder=10, alpha=0.9)
plt.scatter(centroids[:, 0], centroids[:, 1],
marker='x', s=2, linewidths=12,
color=cross_color, zorder=11, alpha=1)
def plot_decision_boundaries(clusterer, X, col, resolution=1000, show_centroids=True,
show_xlabels=True, show_ylabels=True):
mins = X.min(axis=0) - 0.1
maxs = X.max(axis=0) + 0.1
xx, yy = np.meshgrid(np.linspace(mins[0], maxs[0], resolution),
np.linspace(mins[1], maxs[1], resolution))
Z = clusterer.predict(np.c_[xx.ravel(), yy.ravel()])
Z = Z.reshape(xx.shape)
plt.contourf(Z, extent=(mins[0], maxs[0], mins[1], maxs[1]),
cmap="Pastel2")
plt.contour(Z, extent=(mins[0], maxs[0], mins[1], maxs[1]),
linewidths=1, colors='k')
plot_data_C(X, col)
if show_centroids:
plot_centroids_C(clusterer.cluster_centers_)
if show_xlabels:
plt.xlabel("$x_1$", fontsize=14)
else:
plt.tick_params(labelbottom=False)
if show_ylabels:
plt.ylabel("$x_2$", fontsize=14, rotation=0)
else:
plt.tick_params(labelleft=False)
pcaM = PCA(n_components = 2)
HCC1806_drop_lowT = pcaM.fit_transform(HCC1806_drop_f_n.T)
HCC1806_drop_ret = pcaM.inverse_transform(HCC1806_drop_lowT)
print(HCC1806_drop_lowT.shape)
col = []
for c in HCC1806_drop_f_n.columns:
if "Hypo" in c:
col.append("r")
else:
col.append("b")
print(len(col))
index = HCC1806_drop_f_n.T.index
HCC1806_drop_hyponormo = np.zeros((len(index)), dtype = 'int64')
for i in range(len(index)):
HCC1806_drop_hyponormo[i] = not 'Hypo' in index[i] # 0 Hypo 1 Norm
from sklearn import metrics
k = 2
for i in [42, 52, 46, 37, 75]:
kmeans = KMeans(n_clusters=k, random_state=i)
y_pred = kmeans.fit_predict(HCC1806_drop_lowT)
mapping = {}
for class_id in np.unique(HCC1806_drop_hyponormo):
mode, _ = stats.mode(y_pred[HCC1806_drop_hyponormo==class_id]) # look at all samples with a given label and study what is the most common label given by the Gaussian mixture
mapping[mode[0]] = class_id
y_pred = np.array([mapping[cluster_id] for cluster_id in y_pred])
# print(accuracy_score(y_pred, MCF7_hyponormo))
print("fraction of correct predictions", np.sum(y_pred==HCC1806_drop_hyponormo) / len(y_pred))
plt.figure(figsize=(8, 4))
plot_decision_boundaries(kmeans, HCC1806_drop_lowT, col)
plt.show()
We tried to use different random states to calculate an average score of our K-means clustering. We can see that it predicts correctly only around 57% of the data in every case, this is probably due to the fact that a PCA in 2 dimention retain low variance in this particular dataset.
Let's now try clustering without applying PCA.
from sklearn import metrics
k = 2
accuracy_scores = []
for i in [42, 52, 28, 37, 67]:
kmeans = KMeans(n_clusters=k, random_state=i)
y_pred = kmeans.fit_predict(HCC1806_drop_f_n.T)
mapping = {}
for class_id in np.unique(HCC1806_drop_hyponormo):
mode, _ = stats.mode(y_pred[HCC1806_drop_hyponormo==class_id]) # look at all samples with a given label and study what is the most common label given by the Gaussian mixture
mapping[mode[0]] = class_id
y_pred = np.array([mapping[cluster_id] for cluster_id in y_pred])
print(accuracy_score(HCC1806_drop_hyponormo, y_pred))
print("fraction of correct predictions", np.sum(y_pred==HCC1806_drop_hyponormo) / len(y_pred))
accuracy_scores.append(accuracy_score(HCC1806_drop_hyponormo, y_pred))
print("Average accuracy score:", sum(accuracy_scores)/len(accuracy_scores))
We then repeated the clustering using the dataset not processed by the PCA, obtaining a slightly higher accuracy score. Even using the whole dataset, the clustering accuracy is still pretty low.
from sklearn.cluster import kmeans_plusplus
k = 2
accuracy_scores = []
for i in [24, 176, 65, 98, 200]:
centroids, indices = kmeans_plusplus(HCC1806_drop_f_n.T.to_numpy(), n_clusters = 2, random_state=i)
kmeans = KMeans(n_clusters=k, random_state=i)
kmeans.fit(centroids)
y_pred = kmeans.predict(HCC1806_drop_f_n.T)
mapping = {}
for class_id in np.unique(HCC1806_drop_hyponormo):
mode, _ = stats.mode(y_pred[HCC1806_drop_hyponormo==class_id]) # look at all samples with a given label and study what is the most common label given by the Gaussian mixture
mapping[mode[0]] = class_id
# y_pred = np.array([mapping[cluster_id] for cluster_id in y_pred])
print(accuracy_score(HCC1806_drop_hyponormo, y_pred))
#print("fraction of correct predictions", np.sum(y_pred==MCF7_hyponormo) / len(y_pred))
accuracy_scores.append(accuracy_score(HCC1806_drop_hyponormo, y_pred))
print("Average accuracy score:", sum(accuracy_scores)/len(accuracy_scores))
We finally tried to find the centroids using K-means++ and then used them to cluster the data. We can see that the result is worse than the previous case where we used K-means.
To sum up, we have observed eight different datasets, four of MCF7 and four of HCC1806. Both the unfiltered data in SmartSeq were highly sparse (over 50% of zeros), and had a few duplicated rows but no null row.\ In addition, by looking at the gene expression, we observed that most of the data is contained in the outliers, and in each cell, a few genes are highly expressed.\ Both of the data sets are not normalized and overall the skewness is high for most of the cells (and never close to 0). Transformations such as PowerTransformer and QuantileTransformer didn’t work well, however applying the log_2 to the data translated by 1 reduced drastically the skewness for both cell lines.\ Then, by looking at the filtered and normalized data and by comparing gene expression between hypoxic and normoxic cells, we found that there are some genes that are particularly highly expressed in one of the two types, hence more “characteristic”. We confirmed it using random forests and then checked the accuracy of the classifications using K-means clustering. Overall, for MCF7, the differences between gene expression in normoxic and hypoxic cells were more evident, but both cell lines had similar characteristics. For HCC1806, the gene CA9 appeared to be the most important one using the log ratio method, whereas using random forests emerged PGK1. For MCF7 we obtained CYP1A1 with the log ratio and NDRG1 with the random forest. We concluded that outliers have less weight in determining hypoxic/normoxic compared to MCF7.\ Analysing the filtered and normalised data in DropSeq, we noticed that the mean of genes expressed is much lower. For MCF7, the most important gene seemed TFF1 both using the log ratio and the random forest methods. For HCC1806, IGFBP3 was the most important using the log-ratio and NDRG1 using the random forest.
We could also see how the clustering worked much better in the MCF7 SmartSeq dataset than any other.
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.decomposition import PCA
From now on, we will use only the filtered and normalized datasets for our analysis. Starting with a PCA which maintains 95% of the variance explained, in the Smart-seq datesets, we obtain that MCF7 can be expressed by using only 20 directions while HCC1806 is expressed by using 34 directions.\ Regarding instead the Drop-seq experiment, we do not obtain very nice results, in fact, we need 162 directions for the MCF7 dataset in order to retain the 95% of the variance and 844 directions for the HCC1806. A confirmation of this comes from the fact that printing the explained variance ratio it can be seen that the variance explained is very low for the first directions.
pcaM = PCA(n_components = 0.95)
MCF7_low = pcaM.fit_transform(MCF7_f_n.T)
MCF7_ret = pcaM.inverse_transform(MCF7_low)
pcaH = PCA(n_components = 0.95)
HCC1806_low= pcaH.fit_transform(HCC1806_f_n.T)
print(MCF7_low.shape, HCC1806_low.shape)
pcaMD = PCA(n_components= 0.95)
MCF7_lowd = pcaMD.fit_transform(MCF7_drop_f_n.T)
MCF7_lowd.shape
pcaHD = PCA(n_components= 0.95)
HCC1806_f_n_drop_lowd = pcaHD.fit_transform(HCC1806_drop_f_n.T)
HCC1806_f_n_drop_lowd.shape
# Chiedere Dropseq --- Dropseq è un dataset molto corrotto
pcaHD.explained_variance_ratio_
To facilitate the following analysis we create an np.array that stores a binary value for each cell of the dataset: 0 if the cell is Hypo and 1 if it is Norm.
# MCF7 SmartSeq
index = MCF7_f_n.T.index
MCF7_hyponormo = np.zeros((len(index)), dtype = 'int64')
for i in range(len(index)):
MCF7_hyponormo[i] = not 'Hypo' in index[i] # 0 Hypo 1 Norm
# HCC SmartSeq
index = HCC1806_f_n.T.index
HCC1806_hyponormo = np.zeros((len(index)), dtype = 'int64')
for i in range(len(index)):
HCC1806_hyponormo[i] = not 'Hypo' in index[i] # 0 Hypo 1 Norm
# MCF7 DropSeq
index = MCF7_drop_f_n.T.index
MCF7_hyponorm = np.zeros((len(index)), dtype = 'int64')
for i in range(len(index)):
MCF7_hyponorm[i] = not 'Hypo' in index[i] # 0 Hypo 1 Norm
print(MCF7_hyponorm.size)
# HCC DropSeq
index = HCC1806_drop_f_n.T.index
HCC1806_hyponorm = np.zeros((len(index)), dtype = 'int64')
for i in range(len(index)):
HCC1806_hyponorm[i] = not 'Hypo' in index[i] # 0 Hypo 1 Norm
cumsum = np.cumsum(pcaM.explained_variance_ratio_)
d = np.argmax(cumsum >= 0.95) + 1
d
Now, we plot the explained variance ratio for the MCF7 dataset (Smartseq) and it emerges quite evidently that using the first twenty directions the explained variance ratio is almost saturated.
plt.figure(figsize=(6,4))
plt.plot(cumsum, linewidth=3)
plt.axis([0, 50, 0, 1])
plt.xlabel("Dimensions")
plt.ylabel("Explained Variance")
plt.title("MCF7 SmartSeq")
plt.plot([d, d], [0, 0.95], "k:")
plt.plot([0, d], [0.95, 0.95], "k:")
plt.plot(d, 0.95, "ko")
plt.annotate("Elbow", xy=(65, 0.85), xytext=(70, 0.7),
arrowprops=dict(arrowstyle="->"), fontsize=16)
plt.grid(True)
# save_fig("explained_variance_plot")
plt.show()
HCC1806 SQ
cumsum = np.cumsum(pcaH.explained_variance_ratio_)
d = np.argmax(cumsum >= 0.95) + 1
d
Now, we plot the explained variance ratio for the HCC1806 dataset (Smartseq) and it emerges quite evidently that using the first twenty directions the explained variance ratio is almost saturated.
plt.figure(figsize=(6,4))
plt.plot(cumsum, linewidth=3)
plt.axis([0, 50, 0, 1])
plt.xlabel("Dimensions")
plt.ylabel("Explained Variance")
plt.title("HCC1806 SmartSeq")
plt.plot([d, d], [0, 0.95], "k:")
plt.plot([0, d], [0.95, 0.95], "k:")
plt.plot(d, 0.95, "ko")
plt.annotate("Elbow", xy=(65, 0.85), xytext=(70, 0.7),
arrowprops=dict(arrowstyle="->"), fontsize=16)
plt.grid(True)
# save_fig("explained_variance_plot")
plt.show()
MCF7 DQ
cumsum = np.cumsum(pcaMD.explained_variance_ratio_)
d = np.argmax(cumsum >= 0.95) + 1
d
Now, we plot the explained variance ratio for the MCF7 dataset (DropSeq) and it emerges quite evidently that using the first twenty directions the explained variance ratio is almost saturated.
plt.figure(figsize=(6,4))
plt.plot(cumsum, linewidth=3)
plt.axis([0, 1000, 0, 1])
plt.xlabel("Dimensions")
plt.ylabel("Explained Variance")
plt.title("MCF7 DropSeq")
plt.plot([d, d], [0, 0.95], "k:")
plt.plot([0, d], [0.95, 0.95], "k:")
plt.plot(d, 0.95, "ko")
# plt.annotate("Elbow", xy=(65, 0.85), xytext=(70, 0.7), arrowprops=dict(arrowstyle="->"), fontsize=16)
plt.grid(True)
# save_fig("explained_variance_plot")
plt.show()
HCC1806 DQ
cumsum = np.cumsum(pcaHD.explained_variance_ratio_)
d = np.argmax(cumsum >= 0.95) + 1
d
Now, we plot the explained variance ratio for the HCC1806 dataset (DropSeq) and it emerges quite evidently that using the first twenty directions the explained variance ratio is almost saturated.
plt.figure(figsize=(6,4))
plt.plot(cumsum, linewidth=3)
plt.axis([0, 1000, 0, 1])
plt.xlabel("Dimensions")
plt.ylabel("Explained Variance")
plt.title("HCC1806 DropSeq")
plt.plot([d, d], [0, 0.95], "k:")
plt.plot([0, d], [0.95, 0.95], "k:")
plt.plot(d, 0.95, "ko")
# plt.annotate("Elbow", xy=(65, 0.85), xytext=(70, 0.7), arrowprops=dict(arrowstyle="->"), fontsize=16)
plt.grid(True)
# save_fig("explained_variance_plot")
plt.show()
Plotting the PCA results using the first three dimensions, for the SmartSeq experiment we can actually start to see that hypoxia and normoxia cells could be clustered, with the exception of some outliers. On the contrary, this is less evident with the plotting of the DropSeq experiment.
MCF7_colors = []
for c in MCF7_hyponormo:
if c == 1:
MCF7_colors.append("r")
else:
MCF7_colors.append("b")
print(len(MCF7_colors))
# MCF7_low SmartSeq
ax = plt.axes(projection='3d')
ax.scatter3D(MCF7_low[:, 0], MCF7_low[:, 1], MCF7_low[:, 2], c=MCF7_colors)
plt.title("Red are normoxic, blue are hypoxic")
plt.show()
HCC1806_colors = []
for c in HCC1806_hyponormo:
if c == 1:
HCC1806_colors.append("r")
else:
HCC1806_colors.append("b")
print(len(HCC1806_colors))
# HCC1806_low SmartSeq
%matplotlib inline
ax = plt.axes(projection='3d')
ax.scatter3D(HCC1806_low[:, 0], HCC1806_low[:, 1], HCC1806_low[:, 2], c=HCC1806_colors)
plt.title("Red are normoxic, blue are hypoxic")
plt.show()
MCF7d_colors = []
for c in MCF7_hyponorm:
if c == 1:
MCF7d_colors.append("r")
else:
MCF7d_colors.append("b")
print(len(MCF7d_colors))
# MCF7_lowd (Dropseq)
%matplotlib inline
ax = plt.axes(projection='3d')
ax.scatter3D(MCF7_lowd[:, 0], MCF7_lowd[:, 1], MCF7_lowd[:, 2], c=MCF7d_colors, s=0.5, edgecolor=(0,0,0,0))
#ax.view_init(120, 120)
plt.title("Red are normoxic, blue are hypoxic")
plt.show()
HCC1806d_colors = []
for c in HCC1806_hyponorm:
if c == 1:
HCC1806d_colors.append("r")
else:
HCC1806d_colors.append("b")
print(len(HCC1806d_colors))
# HCC1806_f_n_drop_lowd (DropSeq)
ax = plt.axes(projection='3d')
ax.scatter3D(HCC1806_f_n_drop_lowd[:, 0], HCC1806_f_n_drop_lowd[:, 1], HCC1806_f_n_drop_lowd[:, 2], c=HCC1806d_colors, edgecolors=(0,0,0,0))
plt.title("Red are normoxic, blue are hypoxic")
plt.show()
from sklearn.cluster import KMeans, AgglomerativeClustering
from scipy.cluster.hierarchy import dendrogram
k = 2
kmeans = KMeans(n_clusters=4, random_state=42)
y_pred = kmeans.fit_predict(MCF7_f_n.T)
#np.argwhere(y_pred != 0)
We decided to perform Gaussian Model Mixture, in order to see if an unsupervised learning model, based on the probability of different gaussian distribution, (in this case 2), would have found indipendently the two clusters we are dealing with: normoxia and hypoxia state of cells.
MCF7_3D = np.array([MCF7_low[:, 0], MCF7_low[:, 1], MCF7_low[:, 2]])
gm = GaussianMixture(n_components=2, random_state=42).fit(MCF7_3D.T)
y_pred = gm.predict(MCF7_3D.T)
y_pred
mapping = {}
for class_id in np.unique(MCF7_hyponormo):
mode, _ = stats.mode(y_pred[MCF7_hyponormo==class_id]) # look at all samples with a given label and study what is the most common label given by the Gaussian mixture
print(stats.mode(y_pred[MCF7_hyponormo ==class_id]))
mapping[mode[0]] = class_id
mapping
y_pred = np.array([mapping[cluster_id] for cluster_id in y_pred])
y_pred
print("correct predictions: ", np.sum(y_pred==MCF7_hyponormo))
print("fraction of correct predictions", np.sum(y_pred==MCF7_hyponormo) / len(y_pred))
array_errors = []
for i, j, k in zip( y_pred, MCF7_hyponormo, range(len(y_pred))):
if i!=j:
array_errors.append(MCF7_3D[:,k])
len(array_errors)
ax = plt.axes(projection='3d')
plt.figure(figsize=(50,50))
ax.scatter3D(MCF7_3D[0], MCF7_3D[1], MCF7_3D[2], c=MCF7_colors)
ax.scatter3D(array_errors[0], array_errors[1], c= "r")
ax.set_title("Red are normoxic, blue are hypoxic")
plt.show()
For the only purpose to have a nice plot, we applied GMM on the dataset constituted by the first 3 dimensions of PCA ran on the MCF7 f normalized and smartseq. (Notice that, we did so since we previously commented and observed that in this case the first 3 dimensions explain a lot of the variance).
For the sake of clarity and better understanding on the problem, by looking at the graph we can see that the two states are enoughly divided in two gaussian bubbles, of course relative to the translation in this subspace we performed.
We highlighted in red the missclassified cells, later on in the classificator we will take them in special consideration, to double check if our classificator will perform better than this GMM distinction.
As we can see GMM perform a distinction betweeen normoxia and hypoxia pretty accurate: 98% of the predictions are correct and only 5 cells in total are misclassified. This could be due to numerical problems of GMM or either by lab problems. In fact we should keep in mind that while performing the sequentiation some cells could change state (from normoxia to hypoxia) based on environment state (i. e. cells put at the margin during the sequentiation.
gm_whole = GaussianMixture(n_components=2, random_state=42, reg_covar=1e-5).fit(MCF7_f_n.T)
y_pred = gm_whole.predict(MCF7_f_n.T)
y_pred.shape
from scipy import stats
mapping = {}
for class_id in np.unique(MCF7_hyponormo):
mode, _ = stats.mode(y_pred[MCF7_hyponormo==class_id]) # look at all samples with a given label and study what is the most common label given by the Gaussian mixture
print(stats.mode(y_pred[MCF7_hyponormo ==class_id]))
mapping[mode[0]] = class_id
mapping
y_pred = np.array([mapping[cluster_id] for cluster_id in y_pred])
y_pred
print("correct predictions: ", np.sum(y_pred==MCF7_hyponormo))
print("fraction of correct predictions", np.sum(y_pred==MCF7_hyponormo) / len(y_pred))
array_errors = []
arg_where_errors = []
for i, j, k in zip( y_pred, MCF7_hyponormo, range(len(y_pred))):
if i!=j:
array_errors.append(MCF7_f_n.iloc[:,k])
arg_where_errors.append(k)
print(arg_where_errors)
position_all_errors ={}
position_all_errors["MCF7_f_n"] = arg_where_errors
Now performing GMM on the whole dataset (taking in consideration all 3000 features) we still obtain quite good results.
In this case the plot is not available, but as before we keep track of the missclassified cells to check their outcome in the classification we will build.
By checking the behaviour of the classifier we will be able to deduce if the missclassification was due to the GMM method or to a "lab problem" as explained before.
In the case of the cell line HCC1806 sequentiated trought Smart Seq technique, we will not perform GMM on the 3D data sets, because the as we already commented the PCA's first 3 dimensions didn't explained much of the variance. So in order to avoid errors due to the lack of informations given by these 3 dimensions, we will skip this part and we will proceed without a graphical insight of the problem
gm = GaussianMixture(n_components=2, random_state=42).fit(HCC1806_f_n.T)
y_pred = gm.predict(HCC1806_f_n.T)
y_pred.shape
mapping = {}
for class_id in np.unique(HCC1806_hyponormo):
mode, _ = stats.mode(y_pred[HCC1806_hyponormo==class_id]) # look at all samples with a given label and study what is the most common label given by the Gaussian mixture
print(stats.mode(y_pred[HCC1806_hyponormo ==class_id]))
mapping[mode[0]] = class_id
mapping
y_pred = np.array([mapping[cluster_id] for cluster_id in y_pred])
y_pred
print("correct predictions: ", np.sum(y_pred==HCC1806_hyponormo))
print("fraction of correct predictions", np.sum(y_pred==HCC1806_hyponormo) / len(y_pred))
array_errors = []
argwhere_errors = []
for i, j, k in zip( y_pred, HCC1806_hyponormo, range(len(y_pred))):
if i!=j:
array_errors.append(HCC1806_f_n.iloc[:,k])
argwhere_errors.append(k)
print(argwhere_errors)
position_all_errors["HCC1806"] = argwhere_errors
In this case we can see that GMM performed slightly worse than in the previous case, predicting only 93% of correct results.
Again, we will keep track of the errors and, for the same reasons explained before we will check them in the classifier.
comments on the GMM on dropseq
We tried to apply this unsupervised classification method, based on the gaussian probability distribution also on the set cell lines sequentiated with the technique dropseq.
Aim: We are willing to predict whether a cell is in the ‘hypoxia’ or ‘normoxia’ condition based on the specific expressed genes by a given cell line.
Encoding: In our setting, we will refer to the conditions as ‘0’ for hypoxia and ‘1’ for normoxia.
As the overarching theme of our draft for a classifier, we have always stored 10% of the data points away from the training set. In this manner, we will remove any further bias from the analysis during the actual testing procedure.
Having successfully carried the Exploratory Data Analysis (EDA) part, we have gained relevant insights on the dataset which can be used for the classification part. Under this assumption, an initial idea was to implement a Decision Forest so that we could rank the genes available according to the ‘information gain’ criteria. Indeed, using ‘entropy’ as a criterion for the splitting and pruning in the algorithm we could yield a high cross-validation score.
Nonetheless, our final take was to cross-check our assumptions by running other classifiers and, instead of coming down to a final choice, undertake a hard-voting system which could choose the most suitable one. Among the choices available, we have run a Support Vector Machine – using a radial kernel for a greater granular classification and a 50% maximum margin of error – as well as a Logistic Regression. In the latter, a L1 penalty term regularizer yielded the most accurate results yet, given that each competing model made (few) errors on different cells, we were convinced by hard-voting as the best course of action.
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.decomposition import PCA
from scipy.cluster.hierarchy import dendrogram
from sklearn.cluster import KMeans , AgglomerativeClustering
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow import keras
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.metrics import accuracy_score
from sklearn.svm import NuSVC
from sklearn.linear_model import LogisticRegression
Reading the files:
# HCC1806_f_n =pd.read_csv('HCC1806_SmartS_Filtered_Normalised_3000_Data_train.txt',delimiter='\ ',engine='python',index_col=0)
# MCF7_f_n = pd.read_csv('MCF7_SmartS_Filtered_Normalised_3000_Data_train.txt',delimiter='\ ',engine='python',index_col=0)
Marking hypoxia as (0) or normoxia as (1) as usual:
index = MCF7_f_n.T.index
MCF7_hyponormo = np.zeros((len(index)),dtype = 'int64')
for i in range(len(index)):
MCF7_hyponormo[i] = not 'Hypo' in index[i] # 0 Hypo 1 Norm
index = HCC1806_f_n.T.index
HCC1806_hyponormo = np.zeros((len(index)),dtype = 'int64')
for i in range(len(index)):
HCC1806_hyponormo[i] = not 'Hypo' in index[i] # 0 Hypo 1 Norm
Splitting in test and training set:
MCF7_training, MCF7_testing, MCF7_training_target , MCF7_testing_target = train_test_split(MCF7_f_n.T, MCF7_hyponormo, test_size= 0.1, random_state= 42)
HCC1806_training, HCC1806_testing, HCC1806_training_target , HCC1806_testing_target = train_test_split(HCC1806_f_n.T, HCC1806_hyponormo, test_size= 0.1, random_state= 42)
We want to emphasize that, in the full code of our analysis, we will not touch the test set created in this instance. In this manner, we will exclude further bias in our classification algorithms and the test set accuracy will be statistically significant.
First of all, we start by training a random forest classifier so that we are able to select the suitable cell line.
The random forest is run on the merged datasets of MCF7 and HCC1806, both filtered and normalized.
The dependent variable where we will fit our random forest to is an array made of 0s if the row belongs to MCF7, of 1s for HCC1806
MH_f_n = MCF7_f_n.join(HCC1806_f_n, how="outer")
MH_f_n_no = MH_f_n.fillna(0)
MH_f_n_no.T
MC_HC_target =np.concatenate((np.zeros(250, dtype = 'int64'), np.ones(182,dtype = 'int64'))) #0 MCF7 #1 HCC1806
My_cell = RandomForestClassifier(criterion= 'entropy', random_state= 42)
My_cell.fit(MH_f_n_no.T, MC_HC_target)
XHM_train, XHM_test, yHM_train, yHM_test = train_test_split(MH_f_n_no.T , MC_HC_target, test_size=0.2, shuffle = True, random_state=42)
As shown, the predictions on the Random Forest Classifier on the test set yield an accuracy of 1.0, so the classifier is able to distinguish the cells coming from SmartSeq.
My_cell.fit(XHM_train,yHM_train)
accuracy_score(My_cell.predict(XHM_test), yHM_test)
We will now run a random forest on MCF7 with information gain entropy as a criterion for choosing the features in splitting. It yields an accuracy of 99.5% on the test set.
We also use cross-validation.
from sklearn.model_selection import cross_val_score
# Random forest on MCF7 (hypoxia vs normoxia) with information-gain entropy
# as split criterion: 10-fold CV on the training set, then one accuracy
# check on the held-out test set.
my_classM = RandomForestClassifier( criterion= 'entropy', random_state= 42)
scoresD = cross_val_score(my_classM, MCF7_training, MCF7_training_target, cv=10)
print(scoresD.mean())
my_classM.fit(MCF7_training, MCF7_training_target)
accuracy_score(MCF7_testing_target, my_classM.predict(MCF7_testing))
On HCC1806 we have a mean of the scores a bit lower, at 98.7%. We keep using the 10-fold cross-validation to achieve the optimal parameters.
# Same random forest on HCC1806; the mean 10-fold CV score is slightly
# lower than for MCF7 (~98.7% per the surrounding text).
my_classH = RandomForestClassifier(criterion= 'entropy', random_state= 42)
scoresH = cross_val_score(my_classH,HCC1806_training, HCC1806_training_target, cv=10)
print(scoresH.mean())
my_classH.fit(HCC1806_training, HCC1806_training_target)
accuracy_score(HCC1806_testing_target, my_classH.predict(HCC1806_testing))
Subsequently, we run a Nu-Support Vector Classification as a further classifier. 'nu' is an upper bound on the fraction of margin errors and a lower bound of the fraction of support vectors, which should be in the interval (0, 1].
We also use a non-linear kernel and a 10-fold cross validation. The accuracy is 95.77%, and we furthermore check that it works as planned on the testing set and its target.
# NuSVC is not imported by the notebook's first import cell (only SVC is),
# so import it here to keep a top-to-bottom run working.
from sklearn.svm import NuSVC

# Nu-SVC on HCC1806; nu=0.3 is an upper bound on the fraction of margin
# errors and a lower bound on the fraction of support vectors.
my_svmH = NuSVC(nu = 0.3)
scores = cross_val_score(my_svmH, HCC1806_training, HCC1806_training_target, cv=10)
print(scores.mean())
my_svmH.fit(HCC1806_training,HCC1806_training_target)
accuracy_score(my_svmH.predict(HCC1806_testing), HCC1806_testing_target)
With the same argument, we run a SVM classifier on MCF7 and obtain a higher accuracy score, in the same fashion we had for other classifiers when comparing MCF7 with HCC1806.
# NuSVC is not imported by the notebook's first import cell, so import it
# here to keep a top-to-bottom run working.
from sklearn.svm import NuSVC

# Nu-SVC on MCF7 with the same nu=0.3 and 10-fold cross-validation.
my_svmM = NuSVC(nu = 0.3)
scoresM = cross_val_score(my_svmM, MCF7_training, MCF7_training_target, cv=10)
print(scoresM.mean())
my_svmM.fit(MCF7_training, MCF7_training_target)
accuracy_score(my_svmM.predict(MCF7_testing), MCF7_testing_target )
We also run a Logistic Regression on MCF7 and HCC1806 as a further classifier. Most importantly, we use a l1-penalty regularizer with the ‘solver’ set to 'liblinear'.
# L1-regularised logistic regression on MCF7 ('liblinear' is one of the
# solvers that supports the l1 penalty), with 10-fold CV and a final check
# on the held-out test set.
my_logM = LogisticRegression(solver = "liblinear", penalty="l1", random_state=42)
scoresM = cross_val_score(my_logM, MCF7_training, MCF7_training_target, cv=10)
print(scoresM.mean())
my_logM.fit(MCF7_training, MCF7_training_target)
accuracy_score(my_logM.predict(MCF7_testing), MCF7_testing_target )
The Logistic Regression is slightly less effective on HCC1806.
# Same L1-penalised logistic regression on HCC1806.
my_logH = LogisticRegression(solver = "liblinear", penalty="l1", random_state=42)
scoresH = cross_val_score(my_logH, HCC1806_training, HCC1806_training_target, cv=10)
print(scoresH.mean())
my_logH.fit(HCC1806_training, HCC1806_training_target)
accuracy_score(my_logH.predict(HCC1806_testing), HCC1806_testing_target )
As a next step, we will compare the various classifiers according to the dataset we have fed into them and the accuracy score we obtain. We have the option of choosing between a hard, binary vote system or a soft one.
If ‘hard’, the class object uses the predicted class labels for majority rule voting. Else, if ‘soft’, it predicts the class label based on the argmax of the sums of the predicted probabilities.
We also have the option to choose the sequence of weights to weight the occurrences of predicted class labels, but since it is set to None, we use uniform weights.
# VotingClassifier is first imported much later in the notebook (with the
# DropSeq imports), so import it here for a top-to-bottom run.
from sklearn.ensemble import VotingClassifier

# Hard majority vote over the three MCF7 classifiers, uniform weights.
votingM = VotingClassifier(estimators=[('lr', my_logM), ('rf', my_classM), ('svc', my_svmM)],
                           voting = "hard", weights= None)
votingM.fit(MCF7_training, MCF7_training_target)
accuracy_score(votingM.predict(MCF7_testing), MCF7_testing_target )
# VotingClassifier is first imported much later in the notebook, so import
# it here for a top-to-bottom run.
from sklearn.ensemble import VotingClassifier

# Hard majority vote over the three HCC1806 classifiers, uniform weights.
votingH = VotingClassifier(estimators=[('lr', my_logH), ('rf', my_classH), ('svc', my_svmH)],
                           voting = "hard", weights= None)
votingH.fit(HCC1806_training, HCC1806_training_target)
accuracy_score(votingH.predict(HCC1806_testing), HCC1806_testing_target )
It is now time to gather all the classifiers we have built into a single, overarching function.
The function takes as input the test dataset given by the professor, and returns whether each single cell, according to its gene characteristics, belongs to the normoxia or hypoxia class.
We have thus decided to create a stratified classifier, where we will use different classifier according to the cell which is identified in the input, whether it belongs to MCF7 or HCC1806.
Initializing a suitable dataframe for the input:
# fill row
def fill_row(x, template=None):
    """Align one cell's expression Series to the merged SmartSeq gene index.

    Parameters
    ----------
    x : pd.Series
        Expression values for a single cell, indexed by gene name.
    template : pd.DataFrame, optional
        Reference frame whose gene index defines the expected features.
        Defaults to the merged SmartSeq matrix ``MH_f_n_no`` (the previous
        hard-coded behaviour).

    Returns
    -------
    pd.DataFrame
        A single-row frame (1 x genes); genes absent from ``x`` are 0.
    """
    if template is None:
        template = MH_f_n_no
    # Outer join pulls in every gene of the template; lsuffix avoids a
    # collision if the cell name matches a template column.
    aligned = x.to_frame().join(template, lsuffix="z", how='outer')
    values = aligned.fillna(0).iloc[:, 0]
    cell = pd.DataFrame(values.values.reshape(-1, 1), index=values.index.values)
    return cell.T
# FOR A SINGLE CELL
def classifier_smartseq(x):
    """Classify one SmartSeq cell: cell line first, then its oxygen state.

    The cell line is decided by the random forest ``My_cell``; the oxygen
    state by the line-specific voting ensemble (``votingH`` / ``votingM``).

    Parameters
    ----------
    x : pd.Series
        Expression values for one cell, indexed by gene name.

    Returns
    -------
    list
        ``[cell_line, state, state_name]`` with state 0 = Hypo, 1 = Norm.
    """
    cell = pd.DataFrame(x.values.reshape(-1, 1), index=x.index.values).T
    x1 = fill_row(x)
    # Dead commented-out string blocks removed; each voting predictor is now
    # called once per branch instead of twice.
    if My_cell.predict(x1)[0]:  # 1 = HCC1806
        state = votingH.predict(cell)[0]
        return ["HCC1806", state, "Norm" if state else "Hypo"]
    # 0 = MCF7
    state = votingM.predict(cell)[0]
    return ["MCF7", state, "Norm" if state else "Hypo"]
Creating a dictionary with the results:
### classifier for data frame
def classifier_smartseq_dataframe(df):
    """Run the single-cell SmartSeq classifier on every column of *df*.

    Returns a frame with rows 'class', 'state', 'state_name' and one column
    per input cell.
    """
    per_cell = {str(col): classifier_smartseq(df[col]) for col in df}
    table = pd.DataFrame.from_dict(per_cell)
    return table.rename(index={0:'class',1: "state", 2: "state_name"})
# Classify the anonymised SmartSeq test files and export the predictions.
trial = pd.read_csv("XCells_SmartS_Filtered_Normalised_3000_Data_test_anonim.txt",delimiter='\ ',engine='python',index_col=0)
q = classifier_smartseq_dataframe(trial)
q
# Read the per-line anonymised test sets from the working directory. The
# original read the MCF7 file from an absolute machine-specific path
# ("/Users/mariamorandini/Downloads/... (1).txt"), which breaks on any other
# machine; every other file in the notebook is read by relative name.
anonimsM = pd.read_csv("MCF7_SmartS_Filtered_Normalised_3000_Data_test_anonim.txt",delimiter='\ ',engine='python',index_col=0)
anonimsH = pd.read_csv("HCC1806_SmartS_Filtered_Normalised_3000_Data_test_anonim.txt",delimiter='\ ',engine='python',index_col=0)
H = classifier_smartseq_dataframe(anonimsH)
M = classifier_smartseq_dataframe(anonimsM)
M
H
# Save the prediction tables for submission.
H.to_excel("output_hcc1806_smartseq.xlsx")
M.to_excel("output_mcf7_smartseq.xlsx")
After submitting the predictions for the SmartSeq dataset, we got the following results:
For MCF7:\ Out of 63 total cases, we predicted correctly 100% of them.\ 32/32 normoxic cells were correctly classified.\ 31/31 hypoxic cells were correctly classified.
For HCC1806:\ Out of 45 cells, we predicted correctly 95.556% of them.\ 24/26 normoxic cells were correctly classified.\ 19/19 hypoxic cells were correctly classified.\ 2 cells were classified wrongly as hypoxic when they were actually normoxic.
In the DropSeq dataset, a first idea to run the same classifiers was eventually dropped out due to the computational power required to sift through a much higher dimensional dataset. Albeit we left these algorithms free to run for the reader (it will take time!) – without being able to cross validate our results - eventually we opted for a Multi-Layer Perceptron.
Our Neural Network, therefore, will consist of 5 layers which use the ‘SoftMax’ approach. SoftMax converts a vector of values to a probability distribution. The elements of the output vector are in range (0, 1) and sum to 1, where each vector is handled independently. The axis argument in Keras sets which axis of the input the function is applied along.
Overall, this classifier yields a better performance compared to the other algorithms we had used thus far.
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.decomposition import PCA
from scipy.cluster.hierarchy import dendrogram
from sklearn.cluster import KMeans , AgglomerativeClustering
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow import keras
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.metrics import accuracy_score
from sklearn.svm import NuSVC
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
# MCF7_f_n_drop = pd.read_csv('MCF7_Filtered_Normalised_3000_Data_train.txt', delimiter = '\ ', index_col=0,engine='python')
# Added the following line to use the upload from before
MCF7_f_n_drop = MCF7_drop_f_n
# Tag MCF7 DropSeq cell names with an "_m" suffix so they cannot collide
# with HCC1806 names once the two frames are merged.
MCF7_f_n_drop.columns = [str(i)+"_m" for i in MCF7_f_n_drop.columns]
# HCC1806_f_n_drop = pd.read_csv('HCC1806_Filtered_Normalised_3000_Data_train.txt', delimiter = '\ ', engine = 'python', index_col= 0)
# Added the following line to use the upload before
# NOTE(review): HCC1806_drop_f_n is assumed to be loaded in an earlier cell
# (mirroring MCF7_drop_f_n in the first read cell) - confirm it exists
# before running this cell, otherwise uncomment the read above.
HCC1806_f_n_drop = HCC1806_drop_f_n
# Binary oxygen-state targets for DropSeq, derived from the sample names:
# 0 = Hypo, 1 = Norm.
index = HCC1806_f_n_drop.T.index
HCC1806_hyponormo_drop = np.zeros((len(index)),dtype = 'int64')
for i in range(len(index)):
HCC1806_hyponormo_drop[i] = not 'Hypo' in index[i] # 0 Hypo 1 Norm
index = MCF7_f_n_drop.T.index
MCF7_hyponormo_drop = np.zeros((len(index)),dtype = 'int64')
for i in range(len(index)):
MCF7_hyponormo_drop[i] = not 'Hypo' in index[i] # 0 Hypo 1 Norm
# Hold out 10% of each DropSeq dataset as an untouched test set.
MCF7_drop_training, MCF7_drop_testing, MCF7_drop_training_target , MCF7_drop_testing_target = train_test_split(MCF7_f_n_drop.T, MCF7_hyponormo_drop, test_size= 0.1, random_state= 42)
HCC1806_drop_training, HCC1806_drop_testing, HCC1806_drop_training_target , HCC1806_drop_testing_target = train_test_split(HCC1806_f_n_drop.T, HCC1806_hyponormo_drop, test_size= 0.1, random_state= 42)
## here we train a random forest to being able to select the cell line
# Merge the two DropSeq matrices on their gene index, filling missing genes
# with 0, and build the cell-line target (0 = MCF7, 1 = HCC1806).
MH_f_n_drop = MCF7_f_n_drop.join(HCC1806_f_n_drop, how="outer")
MH_f_n_no_drop = MH_f_n_drop.fillna(0)
MH_f_n_no_drop.T  # notebook display: cells x genes view
MC_HC_target_drop = np.concatenate((
    np.zeros(len(MCF7_hyponormo_drop), dtype='int64'),
    np.ones(len(HCC1806_hyponormo_drop), dtype='int64'),
))  # 0 MCF7, 1 HCC1806
XHMd_train, XHMd_test, yHMd_train, yHMd_test = train_test_split(
    MH_f_n_no_drop.T, MC_HC_target_drop, test_size=0.2, shuffle=True, random_state=42)
# Random forest that recognises the DropSeq cell line, scored with 10-fold
# CV on the training split.
My_cell_d = RandomForestClassifier(criterion='entropy', random_state=42)
scoresD = cross_val_score(My_cell_d, XHMd_train, yHMd_train, cv=10)
print(scoresD.mean())
# Fit on the training split only. The original fitted on the full merged
# data and then scored on XHMd_test, which is drawn from that same data -
# that leaks the test rows into training and inflates the reported accuracy.
My_cell_d.fit(XHMd_train, yHMd_train)
accuracy_score(My_cell_d.predict(XHMd_test), yHMd_test)
Remark: The following section contains the cells that run the canonical classifiers on DropSeq; we decided to share them as this was our first approach. We then chose a cleverer and faster way to classify these larger datasets, that is through the use of neural networks. We thus warn that running these cells may take a huge amount of time, so feel free to skip them.
# --- Canonical classifiers on DropSeq (slow; kept for reference) ---
# Random forest on MCF7 DropSeq.
# NOTE(review): this reuses the names my_classM / scoresD from the SmartSeq
# section, overwriting those models - confirm that is intended.
my_classM = RandomForestClassifier( criterion= 'entropy', random_state= 21 )
scoresD = cross_val_score(my_classM, MCF7_drop_training, MCF7_drop_training_target, cv=10)
print(scoresD.mean())
my_classM.fit(MCF7_drop_training, MCF7_drop_training_target)
accuracy_score(MCF7_drop_testing_target, my_classM.predict(MCF7_drop_testing))
# Random forest on HCC1806 DropSeq.
my_classHd = RandomForestClassifier( criterion= 'entropy', random_state= 21 )
scoresD = cross_val_score(my_classHd, HCC1806_drop_training, HCC1806_drop_training_target, cv=10)
print(scoresD.mean())
my_classHd.fit(HCC1806_drop_training, HCC1806_drop_training_target)
accuracy_score(HCC1806_drop_testing_target, my_classHd.predict(HCC1806_drop_testing))
# Nu-SVC on MCF7 DropSeq; cross-validation left commented out because it is
# too slow on this dataset.
my_svmMd = NuSVC(nu = 0.3)
# scoresM = cross_val_score(my_svmMd, MCF7_drop_training, MCF7_drop_training_target, cv=10)
# print(scoresM.mean())
my_svmMd.fit(MCF7_drop_training, MCF7_drop_training_target)
accuracy_score(my_svmMd.predict(MCF7_drop_testing), MCF7_drop_testing_target )
# Nu-SVC on HCC1806 DropSeq.
my_svmHd = NuSVC(nu = 0.3)
# scores = cross_val_score(my_svmH, HCC1806_drop_training, HCC1806_drop_training_target, cv=10)
# print(scores.mean())
my_svmHd.fit(HCC1806_drop_training,HCC1806_drop_training_target)
#my_svmH.decision_function
accuracy_score(my_svmHd.predict(HCC1806_drop_testing), HCC1806_drop_testing_target )
# L1-penalised logistic regression on both DropSeq lines (no CV here).
my_logMd = LogisticRegression(solver = "liblinear", penalty="l1", random_state=42)
my_logMd.fit(MCF7_drop_training, MCF7_drop_training_target)
accuracy_score(my_logMd.predict(MCF7_drop_testing), MCF7_drop_testing_target )
my_logHd = LogisticRegression(solver = "liblinear", penalty="l1", random_state=42)
my_logHd.fit(HCC1806_drop_training, HCC1806_drop_training_target)
accuracy_score(my_logHd.predict(HCC1806_drop_testing), HCC1806_drop_testing_target )
In this setting, we use the sparse categorical cross entropy method which expresses the format in which we mention true labels. If they are integers, you usually use sparse_categorical_crossentropy.
One advantage of using sparse categorical cross entropy is it saves time in memory as well as computation because it simply uses a single integer for a class, rather than a whole vector.
Moreover, we use the 'accuracy' metrics to calculate how often predictions equal labels. This metric creates two local variables, total and count that are used to compute the frequency with which y_pred matches y_true. This frequency is ultimately returned as binary accuracy: an idempotent operation that simply divides total by count.
# Inner train/validation split of the HCC1806 DropSeq training data for the
# neural network (the 10% holdout from before stays untouched).
XH_train, XH_test, yH_train, yH_test = train_test_split(HCC1806_drop_training,HCC1806_drop_training_target, test_size=0.2, random_state=42)
# Feed-forward net: 3000 input genes -> 190 -> 120 -> 20 -> 2 softmax
# outputs (per-class probabilities for hypoxia / normoxia).
modelH = keras.models.Sequential()
modelH.add(keras.layers.Flatten(input_shape=[3000]))
modelH.add(keras.layers.Dense(190, activation="relu"))
modelH.add(keras.layers.Dense(120, activation="relu"))
modelH.add(keras.layers.Dense(20, activation="relu"))
modelH.add(keras.layers.Dense(2, activation = 'softmax'))
# Integer labels -> sparse categorical cross-entropy; plain SGD optimiser.
modelH.compile(loss="sparse_categorical_crossentropy",
optimizer="sgd",
metrics=["accuracy"])
Here, we use the ModelCheckpoint callback which is used in conjunction with training using model.fit() to save a model or weights (in a checkpoint file) at some interval, so the model or weights can be loaded later to continue the training from the state saved.
Also, we use EarlyStopping to stop training when a monitored metric has stopped improving.
# Checkpoint the best epoch by validation accuracy; stop early after 12
# epochs without improvement.
mc = keras.callbacks.ModelCheckpoint('best_model_H.h5', monitor='val_accuracy', mode='max', save_best_only=True)
es=keras.callbacks.EarlyStopping(monitor='val_accuracy',mode='max', patience=12)
modelHC = modelH.fit(XH_train, yH_train, epochs= 30,callbacks = [es,mc],
validation_data= (XH_test, yH_test))
# Reload the best checkpoint and score it on the untouched 10% holdout;
# argmax over the two softmax outputs recovers the predicted class.
from keras.models import load_model
HCC_nn_bestmodel = load_model('best_model_H.h5')
c = HCC_nn_bestmodel.predict(HCC1806_drop_testing)
accuracy_score(c.argmax(1), HCC1806_drop_testing_target)
# Inner train/validation split of the MCF7 DropSeq training data.
XM_train, XM_test, yM_train, yM_test = train_test_split(MCF7_drop_training,MCF7_drop_training_target, test_size=0.2, random_state=42)
# Feed-forward net: 3000 input genes -> 150 -> 100 -> 40 -> 2 softmax outputs.
model = keras.models.Sequential()
model.add(keras.layers.Flatten(input_shape=[3000]))
model.add(keras.layers.Dense(150, activation="relu"))
model.add(keras.layers.Dense(100, activation="relu"))
model.add(keras.layers.Dense(40, activation="relu"))
model.add(keras.layers.Dense(2, activation = 'softmax'))
# Integer labels -> sparse categorical cross-entropy; plain SGD optimiser.
model.compile(loss="sparse_categorical_crossentropy",
optimizer="sgd",
metrics=["accuracy"])
# Checkpoint the best epoch by validation accuracy; stop early after 12
# epochs without improvement.
mc = keras.callbacks.ModelCheckpoint('best_model.h5', monitor='val_accuracy', mode='max', save_best_only=True)
es=keras.callbacks.EarlyStopping(monitor='val_accuracy',mode='max', patience=12)
modelM = model.fit(XM_train, yM_train, epochs=30, callbacks= [es, mc] ,
validation_data= (XM_test, yM_test))
# Reload the best checkpoint and score it on the untouched 10% holdout.
MCF7_nn_best = load_model('best_model.h5')
g = MCF7_nn_best.predict(MCF7_drop_testing)
accuracy_score(g.argmax(1), MCF7_drop_testing_target)
After submitting the predictions for the DropSeq dataset, we got the following results:
For MCF7:\ Out of 5406 cells, we predicted correctly 98.02% of them.\ 3163/3215 normoxic cells were correctly classified.\ 2136/2191 hypoxic cells were correctly classified.\ 52 cells were classified wrongly as hypoxic when they were actually normoxic.\ 55 cells were classified wrongly as normoxic when they were actually hypoxic.
For HCC1806:\ Out of 3671 cells, we predicted correctly 95.45% of them.\ 1388/1454 normoxic cells were correctly classified.\ 2116/2217 hypoxic cells were correctly classified.\ 66 cells were classified wrongly as hypoxic when they were actually normoxic.\ 101 cells were classified wrongly as normoxic when they were actually hypoxic.
Given the task at hand, we build a final classifier so that it can sort our cells based on the ‘cell_type’ parameter, since we do not know a priori the cell line we should receive in the test set. For this reason, we have decided to employ a Random Forest to get a peek on the most expressed genes (as aforementioned). We then sort according to which genes are most expressed in a given cell line and which are the least expressed.
Finally, we coded an overarching function which can take a completely general input from all the datasets and selects the adequate classifier by recognizing its origin.
# fill row
def fill_row_d(x, template=None):
    """Align one cell's expression Series to the merged DropSeq gene index.

    Parameters
    ----------
    x : pd.Series
        Expression values for a single cell, indexed by gene name.
    template : pd.DataFrame, optional
        Reference frame whose gene index defines the expected features.
        Defaults to the merged DropSeq matrix ``MH_f_n_no_drop`` (the
        previous hard-coded behaviour).

    Returns
    -------
    pd.DataFrame
        A single-row frame (1 x genes); genes absent from ``x`` are 0.
    """
    if template is None:
        template = MH_f_n_no_drop
    # Outer join pulls in every gene of the template; lsuffix avoids a
    # collision if the cell name matches a template column.
    aligned = x.to_frame().join(template, lsuffix="z", how='outer')
    values = aligned.fillna(0).iloc[:, 0]
    cell = pd.DataFrame(values.values.reshape(-1, 1), index=values.index.values)
    return cell.T
# FOR A SINGLE CELL
def classifier_dropseq_cell(x):
    """Classify one DropSeq cell: cell line first, then its oxygen state.

    The cell line comes from the random forest ``My_cell_d``; the oxygen
    state from the line-specific network (``HCC_nn_bestmodel`` /
    ``MCF7_nn_best``), whose softmax output is argmax'd to a class.

    Returns
    -------
    list
        ``[cell_line, state, state_name]`` with state 0 = Hypo, 1 = Norm.
    """
    cell = pd.DataFrame(x.values.reshape(-1, 1), index=x.index.values).T
    x1 = fill_row_d(x)
    # Each network is now run once per branch instead of twice.
    if My_cell_d.predict(x1)[0]:  # 1 = HCC1806
        state = HCC_nn_bestmodel.predict(cell).argmax(1)[0]
        return ["HCC1806", state, "Norm" if state else "Hypo"]
    # 0 = MCF7
    state = MCF7_nn_best.predict(cell).argmax(1)[0]
    return ["MCF7", state, "Norm" if state else "Hypo"]
# NN result for mcf7
def classifier_mcf7_d(x, model=None):
    """Label every MCF7 DropSeq cell in *x* as hypoxic or normoxic.

    Parameters
    ----------
    x : pd.DataFrame
        Expression matrix (genes x cells), one column per cell.
    model : optional
        Object with a ``predict`` method returning per-class scores;
        defaults to the trained network ``MCF7_nn_best``.

    Returns
    -------
    pd.DataFrame
        Rows 'class', 'state', 'state_name'; one column per cell
        (state 0 = hypo, 1 = norm).
    """
    if model is None:
        model = MCF7_nn_best
    pred = model.predict(x.T).argmax(1)
    rows = [["MCF7", label, "norm" if label else "hypo"] for label in pred]
    table = pd.DataFrame(rows).T
    return table.rename(index={0:'class',1: "state", 2: "state_name"})
# NN result for HCC1806
def classifier_hcc1806_d(x, model=None):
    """Label every HCC1806 DropSeq cell in *x* as hypoxic or normoxic.

    Parameters
    ----------
    x : pd.DataFrame
        Expression matrix (genes x cells), one column per cell.
    model : optional
        Object with a ``predict`` method returning per-class scores;
        defaults to the trained network ``HCC_nn_bestmodel``.

    Returns
    -------
    pd.DataFrame
        Rows 'class', 'state', 'state_name'; one column per cell
        (state 0 = hypo, 1 = norm).
    """
    if model is None:
        model = HCC_nn_bestmodel
    pred = model.predict(x.T).argmax(1)
    rows = [["HCC1806", label, "norm" if label else "hypo"] for label in pred]
    table = pd.DataFrame(rows).T
    return table.rename(index={0:'class',1: "state", 2: "state_name"})
# general classsifier for dropseq:
# - this one runs faster but assumes that the data set is only of one kind i.e. takes one sample
# - to decide the whole classifier
def classifier_dropseq(x):
    """Classify a DropSeq frame assumed to contain a single cell line.

    The line is decided from the first column only (fast path), then the
    matching per-line classifier is applied to every cell in *x*.
    """
    probe = fill_row_d(x.iloc[:, 0])
    is_hcc1806 = My_cell_d.predict(probe)[0]  # 1 = HCC1806, 0 = MCF7
    return classifier_hcc1806_d(x) if is_hcc1806 else classifier_mcf7_d(x)
# more general classifier- checks cell by cell - Run very slowly, to use only in the case of mixed
def classifier_dropseq_general(x):
    """Cell-by-cell DropSeq classifier for possibly mixed cell lines.

    Much slower than ``classifier_dropseq``; use only when the input may
    contain cells from both lines.
    """
    per_cell = {str(col): classifier_dropseq_cell(x[col]) for col in x}
    table = pd.DataFrame.from_dict(per_cell)
    return table.rename(index={0:'class',1: "state", 2: "state_name"})
# Read the anonymised DropSeq test files (space-separated, genes x cells).
anonyms_Md = pd.read_csv("MCF7_Filtered_Normalised_3000_Data_test_anonim.txt", delimiter = '\ ', index_col=0,engine='python')
anonyms_Hd = pd.read_csv("HCC1806_Filtered_Normalised_3000_Data_test_anonim.txt", delimiter = '\ ', index_col=0,engine='python' )
# Each file is assumed to contain a single cell line, so the fast classifier
# (line decided from the first cell only) is used.
Md = classifier_dropseq(anonyms_Md)
Hd = classifier_dropseq(anonyms_Hd)
Md
Hd
## save outputs
Hd.to_excel("output_hcc1806_dropseq.xlsx")
Md.to_excel("output_mcf7_dropseq.xlsx")